Index: include/polly/CodeGen/IslNodeBuilder.h
===================================================================
--- include/polly/CodeGen/IslNodeBuilder.h
+++ include/polly/CodeGen/IslNodeBuilder.h
@@ -203,6 +203,10 @@
   virtual void createMark(__isl_take isl_ast_node *Marker);
   virtual void createFor(__isl_take isl_ast_node *For);
 
+  /// @brief Preload the memory access at @p AccessRange with @p Build.
+  Value *preloadUnconditionally(__isl_take isl_set *AccessRange,
+                                isl_ast_build *Build);
+
   /// @brief Preload the memory load access @p MA.
   ///
   /// If @p MA is not always executed it will be conditionally loaded and
Index: include/polly/ScopDetection.h
===================================================================
--- include/polly/ScopDetection.h
+++ include/polly/ScopDetection.h
@@ -48,6 +48,7 @@
 #define POLLY_SCOP_DETECTION_H
 
 #include "polly/ScopDetectionDiagnostic.h"
+#include "polly/Support/ScopHelper.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -145,6 +146,9 @@
   using BoxedLoopsMapTy = DenseMap<const Region *, BoxedLoopsSetTy>;
   BoxedLoopsMapTy BoxedLoopsMap;
 
+  /// @brief Map to remember loads that are required to be invariant.
+  DenseMap<const Region *, InvariantLoadsClassesTy> RequiredInvariantLoadsMap;
+
   /// @brief Context variables for SCoP detection.
   struct DetectionContext {
     Region &CurRegion; // The region to check.
@@ -180,12 +184,15 @@
     /// @brief The set of loops contained in non-affine regions.
     BoxedLoopsSetTy &BoxedLoopsSet;
 
+    /// @brief Loads that need to be invariant during execution.
+    InvariantLoadsClassesTy &RequiredILC;
+
     DetectionContext(Region &R, AliasAnalysis &AA,
                      NonAffineSubRegionSetTy &NASRS, BoxedLoopsSetTy &BLS,
-                     bool Verify)
+                     InvariantLoadsClassesTy &RequiredILC, bool Verify)
         : CurRegion(R), AST(AA), Verifying(Verify), Log(&R), hasLoads(false),
           hasStores(false), hasAffineLoops(false), NonAffineSubRegionSet(NASRS),
-          BoxedLoopsSet(BLS) {}
+          BoxedLoopsSet(BLS), RequiredILC(RequiredILC) {}
   };
 
   // Remember the valid regions
@@ -244,6 +251,18 @@
   /// @return True if the call instruction is valid, false otherwise.
   static bool isValidCallInst(CallInst &CI);
 
+  /// @brief Check if the required invariant loads can be hoisted.
+  ///
+  /// If true is returned the loads are added to the required invariant loads
+  /// contained in the @p Context.
+  ///
+  /// @param RequiredILC The loads to check.
+  /// @param Context     The current detection context.
+  ///
+  /// @return True if all loads can be assumed invariant.
+  bool onlyValidRequiredInvariantLoads(InvariantLoadsClassesTy &RequiredILC,
+                                       DetectionContext &Context) const;
+
   /// @brief Check if a value is invariant in the region Reg.
   ///
   /// @param Val Value to check for invariance.
@@ -301,6 +320,18 @@
   bool isValidBranch(BasicBlock &BB, BranchInst *BI, Value *Condition,
                      DetectionContext &Context) const;
 
+  /// @brief Check if the SCEV @p S is affine in the current @p Context.
+  ///
+  /// This will also use a heuristic to decide if we want to require loads to
+  /// be invariant to make the expression affine or if we want to treat it as
+  /// non-affine.
+  ///
+  /// @param S           The expression to be checked.
+  /// @param Context     The context of scop detection.
+  /// @param BaseAddress The base address of the expression @p S (if any).
+  bool isAffine(const SCEV *S, DetectionContext &Context,
+                Value *BaseAddress = nullptr) const;
+
   /// @brief Check if the control flow in a basic block is valid.
   ///
   /// @param BB The BB to check the control flow.
@@ -368,6 +399,10 @@
   /// @brief Return the set of loops in non-affine subregions for @p R.
   const BoxedLoopsSetTy *getBoxedLoops(const Region *R) const;
 
+  /// @brief Return the set of required invariant loads for @p R.
+  const InvariantLoadsClassesTy *
+  getRequiredInvariantLoads(const Region *R) const;
+
   /// @brief Return true if @p SubR is a non-affine subregion in @p ScopR.
   bool isNonAffineSubRegion(const Region *SubR, const Region *ScopR) const;
 
Index: include/polly/ScopInfo.h
===================================================================
--- include/polly/ScopInfo.h
+++ include/polly/ScopInfo.h
@@ -699,8 +699,11 @@
 /// @brief Type for invariant memory accesses and their domain context.
 using InvariantAccessTy = std::pair<MemoryAccess *, isl_set *>;
 
+/// @brief Type for multiple equivalent invariant memory accesses.
+using InvariantAccessListTy = std::forward_list<InvariantAccessTy>;
+
 /// @brief Type for multiple invariant memory accesses and their domain context.
-using InvariantAccessesTy = SmallVector<InvariantAccessTy, 8>;
+using InvariantAccessesTy = SmallVector<InvariantAccessListTy, 8>;
 
 ///===----------------------------------------------------------------------===//
 /// @brief Statement of the Scop
@@ -925,12 +928,12 @@
     BB = Block;
   }
 
-  /// @brief Move the memory access in @p InvMAs to @p TargetList.
+  /// @brief Move the memory access in @p InvMAs to @p InvariantAccesses.
   ///
   /// Note that scalar accesses that are caused by any access in @p InvMAs will
   /// be eliminated too.
   void hoistMemoryAccesses(MemoryAccessList &InvMAs,
-                           InvariantAccessesTy &TargetList);
+                           InvariantAccessesTy &InvariantAccesses);
 
   typedef MemoryAccessVec::iterator iterator;
   typedef MemoryAccessVec::const_iterator const_iterator;
@@ -1027,6 +1030,9 @@
   DominatorTree &DT;
   ScalarEvolution *SE;
 
+  /// @brief The ScopDetection to access the required invariant loads.
+  ScopDetection &SD;
+
   /// The underlying Region.
   Region &R;
@@ -1153,8 +1159,9 @@
   InvariantAccessesTy InvariantAccesses;
 
   /// @brief Scop constructor; invoked from ScopInfo::buildScop.
-  Scop(Region &R, AccFuncMapType &AccFuncMap, ScalarEvolution &SE,
-       DominatorTree &DT, isl_ctx *ctx, unsigned MaxLoopDepth);
+  Scop(Region &R, AccFuncMapType &AccFuncMap, ScopDetection &SD,
+       ScalarEvolution &SE, DominatorTree &DT, isl_ctx *ctx,
+       unsigned MaxLoopDepth);
 
   /// @brief Initialize this ScopInfo.
   void init(LoopInfo &LI, ScopDetection &SD, AliasAnalysis &AA);
@@ -1209,9 +1216,10 @@
   ///
   /// At the moment we perform the following simplifications:
   /// - removal of empty statements (due to invariant load hoisting)
+  /// - consolidation of invariant loads of the same address.
   void simplifySCoP();
 
-  /// @brief Hoist all invariant memory loads.
+  /// @brief Hoist invariant memory loads and check for required ones.
   void hoistInvariantLoads();
 
   /// @brief Build the Context of the Scop.
@@ -1229,6 +1237,9 @@
   /// @brief Simplify the assumed and boundary context.
   void simplifyContexts();
 
+  /// @brief Return the required invariant load equivalence classes.
+  const InvariantLoadsClassesTy &getRIL() const;
+
   /// @brief Create a new SCoP statement for either @p BB or @p R.
   ///
   /// Either @p BB or @p R should be non-null. A new statement for the non-null
@@ -1284,6 +1295,7 @@
   //@}
 
   ScalarEvolution *getSE() const;
+  ScopDetection &getSD() const { return SD; }
 
   /// @brief Get the count of parameters used in this Scop.
   ///
@@ -1604,8 +1616,10 @@
   /// @param L          The parent loop of the instruction
   /// @param R          The region on which to build the data access dictionary.
   /// @param BoxedLoops The set of loops that are overapproximated in @p R.
+  /// @param ScopRIL    The required invariant loads equivalence classes.
   void buildMemoryAccess(Instruction *Inst, Loop *L, Region *R,
-                         const ScopDetection::BoxedLoopsSetTy *BoxedLoops);
+                         const ScopDetection::BoxedLoopsSetTy *BoxedLoops,
+                         const InvariantLoadsClassesTy &ScopRIL);
 
   /// @brief Analyze and extract the cross-BB scalar dependences (or,
   ///        dataflow dependencies) of an instruction.
Index: include/polly/Support/SCEVAffinator.h
===================================================================
--- include/polly/Support/SCEVAffinator.h
+++ include/polly/Support/SCEVAffinator.h
@@ -14,6 +14,7 @@
 #ifndef POLLY_SCEV_AFFINATOR_H
 #define POLLY_SCEV_AFFINATOR_H
 
+#include "polly/Support/ScopHelper.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 
@@ -51,11 +52,13 @@
 
   /// @brief Translate a SCEV to an isl_pw_aff.
   ///
-  /// @param E  he expression that is translated.
-  /// @param BB The block in which @p E is executed.
+  /// @param E   The expression that is translated.
+  /// @param BB  The block in which @p E is executed.
+  /// @param ILC The invariant loads equivalence classes of the SCoP.
   ///
   /// @returns The isl representation of the SCEV @p E in @p Domain.
   __isl_give isl_pw_aff *getPwAff(const llvm::SCEV *E,
+                                  const InvariantLoadsClassesTy &ILC,
                                   llvm::BasicBlock *BB = nullptr);
 
   /// @brief Compute the context in which integer wrapping is happening.
Index: include/polly/Support/SCEVValidator.h
===================================================================
--- include/polly/Support/SCEVValidator.h
+++ include/polly/Support/SCEVValidator.h
@@ -12,6 +12,7 @@
 #ifndef POLLY_SCEV_VALIDATOR_H
 #define POLLY_SCEV_VALIDATOR_H
 
+#include "polly/Support/ScopHelper.h"
 #include "llvm/ADT/SetVector.h"
 #include <vector>
 
@@ -21,6 +22,7 @@
 class ScalarEvolution;
 class Value;
 class Loop;
+class LoadInst;
 }
 
 namespace polly {
@@ -45,11 +47,12 @@
 /// @param R The region in which we look for dependences.
 bool hasScalarDepsInsideRegion(const llvm::SCEV *S, const llvm::Region *R);
 
 bool isAffineExpr(const llvm::Region *R, const llvm::SCEV *Expression,
-                  llvm::ScalarEvolution &SE,
-                  const llvm::Value *BaseAddress = 0);
+                  llvm::ScalarEvolution &SE, const llvm::Value *BaseAddress = 0,
+                  InvariantLoadsClassesTy *ILC = nullptr);
 
 std::vector<const llvm::SCEV *>
 getParamsInAffineExpr(const llvm::Region *R, const llvm::SCEV *Expression,
                       llvm::ScalarEvolution &SE,
+                      const InvariantLoadsClassesTy &ILC,
                       const llvm::Value *BaseAddress = 0);
 
 /// @brief Extract the constant factors from the multiplication @p M.
Index: include/polly/Support/ScopHelper.h
===================================================================
--- include/polly/Support/ScopHelper.h
+++ include/polly/Support/ScopHelper.h
@@ -15,11 +15,12 @@
 #define POLLY_SUPPORT_IRHELPER_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/SetVector.h"
 
 namespace llvm {
 class Type;
 class Instruction;
+class LoadInst;
 class LoopInfo;
 class Loop;
 class ScalarEvolution;
@@ -42,6 +43,13 @@
 typedef llvm::DenseMap<const llvm::Value *, llvm::Value *> ValueMapT;
 typedef llvm::SmallVector<ValueMapT, 8> VectorValueMapT;
 
+/// @brief Type for an __ordered__ set of invariant loads.
+using InvariantLoadsSetTy = llvm::SetVector<llvm::LoadInst *>;
+
+/// @brief Type for equivalence classes of invariant loads.
+using InvariantLoadsClassesTy =
+    llvm::DenseMap<const llvm::SCEV *, InvariantLoadsSetTy>;
+
 /// Temporary Hack for extended regiontree.
 ///
 /// @brief Cast the region to loop.
@@ -129,5 +137,27 @@
 ///
 /// @return The condition of @p TI and nullptr if none could be extracted.
 llvm::Value *getConditionFromTerminator(llvm::TerminatorInst *TI);
+
+/// @brief Check if @p LInst can be hoisted in @p R.
+///
+/// @param LInst The load to check.
+/// @param R     The analyzed region.
+/// @param LI    The loop info.
+/// @param SE    The scalar evolution analysis.
+///
+/// @return True if @p LInst can be hoisted in @p R.
+bool isHoistableLoad(llvm::LoadInst *LInst, llvm::Region &R, llvm::LoopInfo &LI,
+                     llvm::ScalarEvolution &SE);
+
+/// @brief Get the representing SCEV for invariant loads if applicable.
+///
+/// @param S   The SCEV to normalize.
+/// @param SE  The ScalarEvolution analysis.
+/// @param ILC The invariant loads equivalence classes.
+///
+/// @return The representing SCEV for invariant loads or @p S if none.
+const llvm::SCEV *
+normalizeInvariantLoadSCEV(const llvm::SCEV *S, llvm::ScalarEvolution &SE,
+                           const InvariantLoadsClassesTy &ILC);
 }
 
 #endif
Index: lib/Analysis/ScopDetection.cpp
===================================================================
--- lib/Analysis/ScopDetection.cpp
+++ lib/Analysis/ScopDetection.cpp
@@ -51,7 +51,6 @@
 #include "polly/ScopDetection.h"
 #include "polly/ScopDetectionDiagnostic.h"
 #include "polly/Support/SCEVValidator.h"
-#include "polly/Support/ScopHelper.h"
 #include "polly/Support/ScopLocation.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -255,9 +254,10 @@
   if (Verify) {
     BoxedLoopsSetTy DummyBoxedLoopsSet;
     NonAffineSubRegionSetTy DummyNonAffineSubRegionSet;
+    InvariantLoadsClassesTy DummyILC;
     DetectionContext Context(const_cast<Region &>(R), *AA,
                              DummyNonAffineSubRegionSet, DummyBoxedLoopsSet,
-                             false /*verifying*/);
+                             DummyILC, false /*verifying*/);
     return isValidRegion(Context);
   }
 
@@ -299,15 +299,43 @@
   return (AllowNonAffineSubLoops || Context.BoxedLoopsSet.empty());
 }
 
+bool ScopDetection::onlyValidRequiredInvariantLoads(
+    InvariantLoadsClassesTy &RequiredILC, DetectionContext &Context) const {
+
+  for (const auto &RILEquivClass : RequiredILC)
+    for (LoadInst *LInst : RILEquivClass.second)
+      if (!isHoistableLoad(LInst, Context.CurRegion, *LI, *SE))
+        return false;
+
+  for (auto &EquivClass : RequiredILC) {
+    auto &ContextEquivClass = Context.RequiredILC[EquivClass.first];
+    ContextEquivClass.insert(EquivClass.second.begin(),
+                             EquivClass.second.end());
+  }
+
+  return true;
+}
+
+bool ScopDetection::isAffine(const SCEV *S, DetectionContext &Context,
+                             Value *BaseAddress) const {
+
+  InvariantLoadsClassesTy AccessILC;
+  if (!isAffineExpr(&Context.CurRegion, S, *SE, BaseAddress, &AccessILC))
+    return false;
+
+  if (!onlyValidRequiredInvariantLoads(AccessILC, Context))
+    return false;
+
+  return true;
+}
+
 bool ScopDetection::isValidSwitch(BasicBlock &BB, SwitchInst *SI,
                                   Value *Condition,
                                   DetectionContext &Context) const {
-  Region &CurRegion = Context.CurRegion;
-
   Loop *L = LI->getLoopFor(&BB);
   const SCEV *ConditionSCEV = SE->getSCEVAtScope(Condition, L);
 
-  if (!isAffineExpr(&CurRegion, ConditionSCEV, *SE))
+  if (!isAffine(ConditionSCEV, Context))
     if (!AllowNonAffineSubRegions ||
         !addOverApproximatedRegion(RI->getRegionFor(&BB), Context))
       return invalid(Context, /*Assert=*/true, &BB,
@@ -319,8 +347,6 @@
 
 bool ScopDetection::isValidBranch(BasicBlock &BB, BranchInst *BI,
                                   Value *Condition,
                                   DetectionContext &Context) const {
-  Region &CurRegion = Context.CurRegion;
-
   // Non constant conditions of branches need to be ICmpInst.
   if (!isa<ICmpInst>(Condition)) {
     if (!AllowNonAffineSubRegions ||
@@ -352,8 +378,7 @@
   const SCEV *LHS = SE->getSCEVAtScope(ICmp->getOperand(0), L);
   const SCEV *RHS = SE->getSCEVAtScope(ICmp->getOperand(1), L);
 
-  if (!isAffineExpr(&CurRegion, LHS, *SE) ||
-      !isAffineExpr(&CurRegion, RHS, *SE)) {
+  if (!isAffine(LHS, Context) || !isAffine(RHS, Context)) {
     if (!AllowNonAffineSubRegions ||
         !addOverApproximatedRegion(RI->getRegionFor(&BB), Context))
       return invalid(Context, /*Assert=*/true, &BB, LHS,
@@ -441,18 +466,6 @@
   if (!isInvariant(*Operand, Reg))
     return false;
 
-  // When the instruction is a load instruction, check that no write to memory
-  // in the region aliases with the load.
-  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    auto Loc = MemoryLocation::get(LI);
-
-    // Check if any basic block in the region can modify the location pointed
-    // to by 'Loc'. If so, 'Val' is (likely) not invariant in the region.
-    for (const BasicBlock *BB : Reg.blocks())
-      if (AA->canBasicBlockModify(*BB, Loc))
-        return false;
-  }
-
   return true;
 }
 
@@ -536,7 +549,7 @@
     const Instruction *Insn = Pair.first;
     const SCEV *AF = Pair.second;
 
-    if (!isAffineExpr(&CurRegion, AF, *SE, BaseValue)) {
+    if (!isAffine(AF, Context, BaseValue)) {
       invalid(Context, /*Assert=*/true, AF, Insn, BaseValue);
 
       if (!KeepGoing)
@@ -563,7 +576,7 @@
       MemAcc *Acc = &TempMemoryAccesses.find(Insn)->second;
 
       if (!AF) {
-        if (isAffineExpr(&CurRegion, Pair.second, *SE, BaseValue))
+        if (isAffine(Pair.second, Context, BaseValue))
           Acc->DelinearizedSubscripts.push_back(Pair.second);
         else
           IsNonAffine = true;
@@ -573,7 +586,7 @@
         if (Acc->DelinearizedSubscripts.size() == 0)
           IsNonAffine = true;
         for (const SCEV *S : Acc->DelinearizedSubscripts)
-          if (!isAffineExpr(&CurRegion, S, *SE, BaseValue))
+          if (!isAffine(S, Context, BaseValue))
            IsNonAffine = true;
       }
 
@@ -644,11 +657,11 @@
   if (PollyDelinearize && !isVariantInNonAffineLoop) {
     Context.Accesses[BasePointer].push_back({&Inst, AccessFunction});
 
-    if (!isAffineExpr(&CurRegion, AccessFunction, *SE, BaseValue))
+    if (!isAffine(AccessFunction, Context, BaseValue))
       Context.NonAffineAccesses.insert(BasePointer);
   } else if (!AllowNonAffine) {
     if (isVariantInNonAffineLoop ||
-        !isAffineExpr(&CurRegion, AccessFunction, *SE, BaseValue))
+        !isAffine(AccessFunction, Context, BaseValue))
       return invalid(Context, /*Assert=*/true, AccessFunction, &Inst,
                      BaseValue);
   }
@@ -682,9 +695,17 @@
   //        the beginning of the SCoP. This breaks if the base pointer is
   //        defined inside the scop. Hence, we can only create a run-time
   //        check if we are sure the base pointer is not an instruction
   //        defined inside the scop.
+  //        However, we can ignore loads that will be hoisted.
   for (const auto &Ptr : AS) {
     Instruction *Inst = dyn_cast<Instruction>(Ptr.getValue());
     if (Inst && CurRegion.contains(Inst)) {
+      LoadInst *LInst = dyn_cast<LoadInst>(Inst);
+      if (LInst && isHoistableLoad(LInst, CurRegion, *LI, *SE)) {
+        const SCEV *PointerSCEV = SE->getSCEV(LInst->getPointerOperand());
+        Context.RequiredILC[PointerSCEV].insert(LInst);
+        continue;
+      }
+
       CanBuildRunTimeCheck = false;
       break;
     }
@@ -805,7 +826,8 @@
   while (ExpandedRegion) {
     DetectionContext Context(
         *ExpandedRegion, *AA, NonAffineSubRegionMap[ExpandedRegion.get()],
-        BoxedLoopsMap[ExpandedRegion.get()], false /* verifying */);
+        BoxedLoopsMap[ExpandedRegion.get()],
+        RequiredInvariantLoadsMap[ExpandedRegion.get()], false /* verifying */);
     DEBUG(dbgs() << "\t\tTrying " << ExpandedRegion->getNameStr() << "\n");
 
     // Only expand when we did not collect errors.
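
A note on the detection changes above: a load only ends up in
Context.RequiredILC after isHoistableLoad approved it, keyed by the SCEV of
its pointer operand. A minimal sketch of the invariant this establishes
(illustrative only; checkRequiredILC is a hypothetical helper, not part of
this patch):

    // Every "required invariant" load has a region-invariant address and can
    // therefore be preloaded at the SCoP entry.
    static void checkRequiredILC(const polly::InvariantLoadsClassesTy &RIL,
                                 llvm::Region &R, llvm::LoopInfo &LI,
                                 llvm::ScalarEvolution &SE) {
      for (const auto &EquivClass : RIL)
        for (llvm::LoadInst *LInst : EquivClass.second)
          assert(polly::isHoistableLoad(LInst, R, LI, SE) &&
                 "detection only records hoistable loads");
    }
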
@@ -871,7 +893,7 @@
 
 void ScopDetection::findScops(Region &R) {
   DetectionContext Context(R, *AA, NonAffineSubRegionMap[&R], BoxedLoopsMap[&R],
-                           false /*verifying*/);
+                           RequiredInvariantLoadsMap[&R], false /*verifying*/);
 
   bool RegionIsValid = false;
   if (!DetectUnprofitable && regionWithoutLoops(R, LI)) {
@@ -1110,14 +1132,23 @@
   return &BLMIt->second;
 }
 
+const InvariantLoadsClassesTy *
+ScopDetection::getRequiredInvariantLoads(const Region *R) const {
+  auto RILMIt = RequiredInvariantLoadsMap.find(R);
+  if (RILMIt == RequiredInvariantLoadsMap.end())
+    return nullptr;
+  return &RILMIt->second;
+}
+
 void polly::ScopDetection::verifyRegion(const Region &R) const {
   assert(isMaxRegionInScop(R) && "Expect R is a valid region.");
 
   BoxedLoopsSetTy DummyBoxedLoopsSet;
   NonAffineSubRegionSetTy DummyNonAffineSubRegionSet;
+  InvariantLoadsClassesTy DummyILC;
   DetectionContext Context(const_cast<Region &>(R), *AA,
                            DummyNonAffineSubRegionSet, DummyBoxedLoopsSet,
-                           true /*verifying*/);
+                           DummyILC, true /*verifying*/);
   isValidRegion(Context);
 }
 
@@ -1151,6 +1182,7 @@
   InsnToMemAcc.clear();
   BoxedLoopsMap.clear();
   NonAffineSubRegionMap.clear();
+  RequiredInvariantLoadsMap.clear();
   // Do not clear the invalid function set.
 }
 
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -1075,6 +1075,10 @@
   isl_local_space *LSpace = isl_local_space_from_space(getDomainSpace());
   Type *Ty = GEP->getPointerOperandType();
   ScalarEvolution &SE = *Parent.getSE();
+  ScopDetection &SD = Parent.getSD();
+
+  // The set of loads that are required to be invariant.
+  auto &ScopRIL = *SD.getRequiredInvariantLoads(&Parent.getRegion());
 
   std::vector<const SCEV *> Subscripts;
   std::vector<int> Sizes;
@@ -1093,7 +1097,17 @@
     auto Expr = Subscripts[i + IndexOffset];
     auto Size = Sizes[i];
 
-    if (!isAffineExpr(&Parent.getRegion(), Expr, SE))
+    InvariantLoadsClassesTy AccessRIL;
+    if (!isAffineExpr(&Parent.getRegion(), Expr, SE, nullptr, &AccessRIL))
+      continue;
+
+    bool NonAffine = false;
+    for (const auto EquivClass : AccessRIL)
+      for (LoadInst *LInst : EquivClass.second)
+        if (!ScopRIL.lookup(EquivClass.first).count(LInst))
+          NonAffine = true;
+
+    if (NonAffine)
       continue;
 
     isl_pw_aff *AccessOffset = getPwAff(Expr);
@@ -1351,7 +1365,7 @@
 void ScopStmt::dump() const { print(dbgs()); }
 
 void ScopStmt::hoistMemoryAccesses(MemoryAccessList &InvMAs,
-                                   InvariantAccessesTy &TargetList) {
+                                   InvariantAccessesTy &InvariantAccesses) {
 
   // Remove all memory accesses in @p InvMAs from this statement together
   // with all scalar accesses that were caused by them. The tricky iteration
@@ -1384,8 +1398,54 @@
   DomainCtx = isl_set_detect_equalities(DomainCtx);
   DomainCtx = isl_set_coalesce(DomainCtx);
 
-  for (MemoryAccess *MA : InvMAs)
-    TargetList.push_back(std::make_pair(MA, isl_set_copy(DomainCtx)));
+  Scop &S = *getParent();
+  ScalarEvolution &SE = *S.getSE();
+
+  // Project out all parameters that relate to loads in this statement that
+  // we will hoist. Otherwise we would have cyclic dependences on the
+  // constraints under which the hoisted loads are executed.
+  for (MemoryAccess *MA : InvMAs) {
+    Instruction *AccInst = MA->getAccessInstruction();
+    if (SE.isSCEVable(AccInst->getType())) {
+      isl_id *ParamId = S.getIdForParam(SE.getSCEV(AccInst));
+      if (ParamId) {
+        int Dim = isl_set_find_dim_by_id(DomainCtx, isl_dim_param, ParamId);
+        DomainCtx = isl_set_eliminate(DomainCtx, isl_dim_param, Dim, 1);
+      }
+      isl_id_free(ParamId);
+    }
+  }
+
+  for (MemoryAccess *MA : InvMAs) {
+
+    // Check for another invariant access that accesses the same location as
+    // MA and if found consolidate them. Otherwise create a new equivalence
+    // class at the end of InvariantAccesses.
+    // TODO: This is quadratic in the number of invariant accesses.
+    LoadInst *LInst = cast<LoadInst>(MA->getAccessInstruction());
+    const SCEV *PointerSCEV = SE.getSCEV(LInst->getPointerOperand());
+    bool Consolidated = false;
+
+    for (auto &IAClass : InvariantAccesses) {
+      InvariantAccessTy &IA = IAClass.front();
+      LoadInst *IALInst = cast<LoadInst>(IA.first->getAccessInstruction());
+      const SCEV *IAPointerSCEV = SE.getSCEV(IALInst->getPointerOperand());
+      if (PointerSCEV != IAPointerSCEV)
+        continue;
+
+      isl_set *IAClassDomainCtx =
+          isl_set_union(IA.second, isl_set_copy(DomainCtx));
+      IAClass.push_front(std::make_pair(MA, IAClassDomainCtx));
+      Consolidated = true;
+      break;
+    }
+
+    if (Consolidated)
+      continue;
+
+    InvariantAccesses.emplace_back(
+        InvariantAccessListTy({std::make_pair(MA, isl_set_copy(DomainCtx))}));
+  }
 
   isl_set_free(DomainCtx);
 }
@@ -1413,6 +1473,9 @@
 }
 
 __isl_give isl_id *Scop::getIdForParam(const SCEV *Parameter) const {
+  // Normalize invariant loads first to get the correct parameter SCEV.
+  Parameter = normalizeInvariantLoadSCEV(Parameter, *getSE(), getRIL());
+
   ParamIdType::const_iterator IdIter = ParameterIds.find(Parameter);
 
   if (IdIter == ParameterIds.end())
@@ -2281,10 +2344,10 @@
   return MaxLD - MinLD + 1;
 }
 
-Scop::Scop(Region &R, AccFuncMapType &AccFuncMap,
+Scop::Scop(Region &R, AccFuncMapType &AccFuncMap, ScopDetection &SD,
            ScalarEvolution &ScalarEvolution, DominatorTree &DT,
            isl_ctx *Context, unsigned MaxLoopDepth)
-    : DT(DT), SE(&ScalarEvolution), R(R), AccFuncMap(AccFuncMap),
+    : DT(DT), SE(&ScalarEvolution), SD(SD), R(R), AccFuncMap(AccFuncMap),
       IsOptimized(false), HasSingleExitEdge(R.getExitingBlock()),
       MaxLoopDepth(MaxLoopDepth), IslCtx(Context), Affinator(this),
       BoundaryContext(nullptr) {}
@@ -2334,8 +2397,8 @@
     }
   }
 
-  for (const auto &IA : InvariantAccesses)
-    isl_set_free(IA.second);
+  for (const auto &IAClass : InvariantAccesses)
+    isl_set_free(IAClass.front().second);
 }
 
 void Scop::updateAccessDimensionality() {
@@ -2370,7 +2433,9 @@
 
   // TODO: Loads that are not loop carried, hence are in a statement with
   //       zero iterators, are by construction invariant, though we
-  //       currently "hoist" them anyway.
+  //       currently "hoist" them anyway. This is necessary because we allow
+  //       them to be treated as parameters (e.g., in conditions) and our code
+  //       generation would otherwise use the old value.
 
   isl_set *Domain = Stmt.getDomain();
   MemoryAccessList InvMAs;
@@ -2417,6 +2482,81 @@
 
   if (!InvariantAccesses.empty())
     IsOptimized = true;
+
+  // Check required invariant loads that were tagged during SCoP detection.
+  for (const auto EquivClass : getRIL())
+    for (LoadInst *LI : EquivClass.second) {
+      assert(LI && getRegion().contains(LI));
+      ScopStmt *Stmt = getStmtForBasicBlock(LI->getParent());
+      if (Stmt->lookupAccessesFor(LI) != nullptr) {
+        DEBUG(errs() << "\n\nWARNING: Load (" << *LI
+                     << ") is required to be invariant but was not marked as "
+                        "such. SCoP for " << getRegion()
+                     << " will be dropped\n\n");
+        addAssumption(isl_set_empty(getParamSpace()));
+        return;
+      }
+    }
+
+  // We want invariant accesses to be sorted in a "natural order" because
+  // there might be dependences between invariant loads. These can be caused
+  // by indirect loads but also because an invariant load is only
+  // conditionally executed and the condition is dependent on another
+  // invariant load. As we want to do code generation in a straightforward
+  // way, e.g., preload the accesses in the list one after another, we sort
+  // them such that the preloaded values needed in the conditions will always
+  // be in front. We already ordered the accesses such that indirect loads
+  // can be resolved, thus we use a stable sort here.
+
+  auto compareInvariantAccesses = [this](
+      const InvariantAccessListTy &IAClass0,
+      const InvariantAccessListTy &IAClass1) {
+    const InvariantAccessTy &IA0 = IAClass0.front();
+    const InvariantAccessTy &IA1 = IAClass1.front();
+
+    Instruction *AI0 = IA0.first->getAccessInstruction();
+    Instruction *AI1 = IA1.first->getAccessInstruction();
+
+    const SCEV *S0 =
+        SE->isSCEVable(AI0->getType()) ? SE->getSCEV(AI0) : nullptr;
+    const SCEV *S1 =
+        SE->isSCEVable(AI1->getType()) ? SE->getSCEV(AI1) : nullptr;
+
+    isl_id *Id0 = getIdForParam(S0);
+    isl_id *Id1 = getIdForParam(S1);
+
+    if (Id0 && !Id1) {
+      isl_id_free(Id0);
+      isl_id_free(Id1);
+      return true;
+    }
+
+    if (!Id0) {
+      isl_id_free(Id0);
+      isl_id_free(Id1);
+      return false;
+    }
+
+    assert(Id0 && Id1);
+
+    isl_set *Dom0 = IA0.second;
+    isl_set *Dom1 = IA1.second;
+
+    int Dim0 = isl_set_find_dim_by_id(Dom0, isl_dim_param, Id0);
+    int Dim1 = isl_set_find_dim_by_id(Dom0, isl_dim_param, Id1);
+
+    bool Involves0Id1 = isl_set_involves_dims(Dom0, isl_dim_param, Dim1, 1);
+    bool Involves1Id0 = isl_set_involves_dims(Dom1, isl_dim_param, Dim0, 1);
+    assert(!(Involves0Id1 && Involves1Id0));
+
+    isl_id_free(Id0);
+    isl_id_free(Id1);
+
+    return Involves1Id0;
+  };
+
+  std::stable_sort(InvariantAccesses.begin(), InvariantAccesses.end(),
+                   compareInvariantAccesses);
 }
 
 const ScopArrayInfo *
@@ -2600,9 +2740,9 @@
   OS.indent(4) << "Region: " << getNameStr() << "\n";
   OS.indent(4) << "Max Loop Depth: " << getMaxLoopDepth() << "\n";
   OS.indent(4) << "Invariant Accesses: {\n";
-  for (const auto &IA : InvariantAccesses) {
-    IA.first->print(OS);
-    OS.indent(12) << "Execution Context: " << IA.second << "\n";
+  for (const auto &IAClass : InvariantAccesses) {
+    IAClass.front().first->print(OS);
+    OS.indent(12) << "Execution Context: " << IAClass.front().second << "\n";
   }
   OS.indent(4) << "}\n";
   printContext(OS.indent(4));
@@ -2615,8 +2755,12 @@
 
 isl_ctx *Scop::getIslCtx() const { return IslCtx; }
 
+const InvariantLoadsClassesTy &Scop::getRIL() const {
+  return *SD.getRequiredInvariantLoads(&getRegion());
+}
+
 __isl_give isl_pw_aff *Scop::getPwAff(const SCEV *E, BasicBlock *BB) {
-  return Affinator.getPwAff(E, BB);
+  return Affinator.getPwAff(E, getRIL(), BB);
 }
 
 __isl_give isl_union_set *Scop::getDomains() const {
@@ -3044,7 +3188,8 @@
 
 void ScopInfo::buildMemoryAccess(
     Instruction *Inst, Loop *L, Region *R,
-    const ScopDetection::BoxedLoopsSetTy *BoxedLoops) {
+    const ScopDetection::BoxedLoopsSetTy *BoxedLoops,
+    const InvariantLoadsClassesTy &ScopRIL) {
   unsigned Size;
   Type *SizeType;
   Value *Val;
@@ -3091,11 +3236,19 @@
     std::vector<const SCEV *> SizesSCEV;
 
     bool AllAffineSubcripts = true;
-    for (auto Subscript : Subscripts)
-      if (!isAffineExpr(R, Subscript, *SE)) {
-        AllAffineSubcripts = false;
+    for (auto Subscript : Subscripts) {
+      InvariantLoadsClassesTy AccessRIL;
+      AllAffineSubcripts =
+          isAffineExpr(R, Subscript, *SE, nullptr, &AccessRIL);
+
+      for (const auto EquivClass : AccessRIL)
+        for (LoadInst *LInst : EquivClass.second)
+          if (!ScopRIL.lookup(EquivClass.first).count(LInst))
+            AllAffineSubcripts = false;
+
+      if (!AllAffineSubcripts)
         break;
-      }
+    }
 
     if (AllAffineSubcripts && Sizes.size() > 0) {
       for (auto V : Sizes)
@@ -3129,8 +3282,15 @@
       isVariantInNonAffineLoop = true;
   }
 
-  bool IsAffine = !isVariantInNonAffineLoop &&
-                  isAffineExpr(R, AccessFunction, *SE, BasePointer->getValue());
+  InvariantLoadsClassesTy AccessRIL;
+  bool IsAffine =
+      !isVariantInNonAffineLoop &&
+      isAffineExpr(R, AccessFunction, *SE, BasePointer->getValue(), &AccessRIL);
+
+  for (const auto EquivClass : AccessRIL)
+    for (LoadInst *LInst : EquivClass.second)
+      if (!ScopRIL.lookup(EquivClass.first).count(LInst))
+        IsAffine = false;
 
   // FIXME: Size of the number of bytes of an array element, not the number of
   // elements as probably intended here.
@@ -3168,6 +3328,9 @@
   // The set of loops contained in non-affine subregions that are part of R.
   const ScopDetection::BoxedLoopsSetTy *BoxedLoops = SD->getBoxedLoops(&R);
 
+  // The set of loads that are required to be invariant.
+  auto &ScopRIL = *SD->getRequiredInvariantLoads(&R);
+
   for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) {
     Instruction *Inst = I;
 
@@ -3179,12 +3342,24 @@
     if (!PHI && IsExitBlock)
       break;
 
+    // TODO: At this point we only know that elements of ScopRIL have to be
+    //       invariant and will be hoisted for the SCoP to be processed.
+    //       However, there might be other invariant accesses that will be
+    //       hoisted and that would allow us to make a non-affine access
+    //       affine.
    if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
-      buildMemoryAccess(Inst, L, &R, BoxedLoops);
+      buildMemoryAccess(Inst, L, &R, BoxedLoops, ScopRIL);
 
     if (isIgnoredIntrinsic(Inst))
      continue;
 
+    // Do not build scalar dependences for required invariant loads as we will
+    // hoist them later on anyway or drop the SCoP if we cannot.
+    if (LoadInst *LInst = dyn_cast<LoadInst>(Inst)) {
+      const SCEV *PointerSCEV = SE->getSCEV(LInst->getPointerOperand());
+      if (ScopRIL.lookup(PointerSCEV).count(LInst))
+        continue;
+    }
+
     if (buildScalarDependences(Inst, &R, NonAffineSubRegion)) {
       if (!isa<StoreInst>(Inst))
         addScalarWriteAccess(Inst);
@@ -3254,7 +3429,7 @@
 void ScopInfo::buildScop(Region &R, DominatorTree &DT) {
   unsigned MaxLoopDepth = getMaxLoopDepthInRegion(R, *LI, *SD);
 
-  scop = new Scop(R, AccFuncMap, *SE, DT, ctx, MaxLoopDepth);
+  scop = new Scop(R, AccFuncMap, *SD, *SE, DT, ctx, MaxLoopDepth);
 
   buildAccessFunctions(R, R);
Index: lib/CodeGen/BlockGenerators.cpp
===================================================================
--- lib/CodeGen/BlockGenerators.cpp
+++ lib/CodeGen/BlockGenerators.cpp
@@ -730,9 +730,7 @@
 
 Value *VectorBlockGenerator::generateUnknownStrideLoad(
     ScopStmt &Stmt, const LoadInst *Load, VectorValueMapT &ScalarMaps,
-    __isl_keep isl_id_to_ast_expr *NewAccesses
-
-    ) {
+    __isl_keep isl_id_to_ast_expr *NewAccesses) {
   int VectorWidth = getVectorWidth();
   const Value *Pointer = Load->getPointerOperand();
   VectorType *VectorType = VectorType::get(
Index: lib/CodeGen/CodeGeneration.cpp
===================================================================
--- lib/CodeGen/CodeGeneration.cpp
+++ lib/CodeGen/CodeGeneration.cpp
@@ -144,9 +144,15 @@
   BasicBlock *StartBlock = executeScopConditionally(S, this, Builder.getTrue());
   auto SplitBlock = StartBlock->getSinglePredecessor();
+
+  // First generate code for the hoisted invariant loads and transitively the
+  // parameters they reference. Afterwards, generate code for the remaining
+  // parameters that might reference the hoisted loads. Finally, build the
+  // runtime check that might reference both hoisted loads as well as
+  // parameters.
   Builder.SetInsertPoint(SplitBlock->getTerminator());
-  NodeBuilder.addParameters(S.getContext());
   NodeBuilder.preloadInvariantLoads();
+  NodeBuilder.addParameters(S.getContext());
+
   Value *RTC = buildRTC(Builder, NodeBuilder.getExprBuilder());
   Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
   Builder.SetInsertPoint(StartBlock->begin());
Index: lib/CodeGen/IslNodeBuilder.cpp
===================================================================
--- lib/CodeGen/IslNodeBuilder.cpp
+++ lib/CodeGen/IslNodeBuilder.cpp
@@ -833,11 +833,8 @@
   }
 }
 
-/// @brief Create the actual preload memory access for @p MA.
-static inline Value *createPreloadLoad(Scop &S, const MemoryAccess &MA,
-                                       isl_ast_build *Build,
-                                       IslExprBuilder &ExprBuilder) {
-  isl_set *AccessRange = isl_map_range(MA.getAccessRelation());
+Value *IslNodeBuilder::preloadUnconditionally(isl_set *AccessRange,
+                                              isl_ast_build *Build) {
   isl_pw_multi_aff *PWAccRel = isl_pw_multi_aff_from_set(AccessRange);
   PWAccRel = isl_pw_multi_aff_gist_params(PWAccRel, S.getContext());
   isl_ast_expr *Access =
@@ -849,15 +846,19 @@
                                                isl_set *Domain,
                                                isl_ast_build *Build) {
 
+  isl_set *AccessRange = isl_map_range(MA.getAccessRelation());
+  materializeParameters(AccessRange, false);
+
   isl_set *Universe = isl_set_universe(isl_set_get_space(Domain));
   bool AlwaysExecuted = isl_set_is_equal(Domain, Universe);
   isl_set_free(Universe);
 
   if (AlwaysExecuted) {
     isl_set_free(Domain);
-    return createPreloadLoad(S, MA, Build, ExprBuilder);
+    return preloadUnconditionally(AccessRange, Build);
   } else {
 
+    materializeParameters(Domain, false);
     isl_ast_expr *DomainCond = isl_ast_build_expr_from_set(Build, Domain);
     Value *Cond = ExprBuilder.create(DomainCond);
 
@@ -890,7 +891,7 @@
     Builder.SetInsertPoint(ExecBB->getTerminator());
     Instruction *AccInst = MA.getAccessInstruction();
     Type *AccInstTy = AccInst->getType();
-    Value *PreAccInst = createPreloadLoad(S, MA, Build, ExprBuilder);
+    Value *PreAccInst = preloadUnconditionally(AccessRange, Build);
 
     Builder.SetInsertPoint(MergeBB->getTerminator());
     auto *MergePHI = Builder.CreatePHI(
@@ -904,8 +905,8 @@
 
 void IslNodeBuilder::preloadInvariantLoads() {
 
-  const auto &InvAccList = S.getInvariantAccesses();
-  if (InvAccList.empty())
+  const auto &InvariantAccesses = S.getInvariantAccesses();
+  if (InvariantAccesses.empty())
     return;
 
   const Region &R = S.getRegion();
@@ -919,14 +920,16 @@
   isl_ast_build *Build =
       isl_ast_build_from_context(isl_set_universe(S.getParamSpace()));
 
-  for (const auto &IA : InvAccList) {
-    MemoryAccess *MA = IA.first;
+  for (const auto &IAClass : InvariantAccesses) {
+
+    MemoryAccess *MA = IAClass.front().first;
     assert(!MA->isImplicit());
 
-    isl_set *Domain = isl_set_copy(IA.second);
+    isl_set *Domain = isl_set_copy(IAClass.front().second);
     Instruction *AccInst = MA->getAccessInstruction();
     Value *PreloadVal = preloadInvariantLoad(*MA, Domain, Build);
-    ValueMap[AccInst] = PreloadVal;
+    for (const InvariantAccessTy &IA : IAClass)
+      ValueMap[IA.first->getAccessInstruction()] = PreloadVal;
 
     if (SE.isSCEVable(AccInst->getType())) {
      isl_id *ParamId = S.getIdForParam(SE.getSCEV(AccInst));
@@ -993,5 +996,5 @@
 Value *IslNodeBuilder::generateSCEV(const SCEV *Expr) {
   Instruction *InsertLocation = --(Builder.GetInsertBlock()->end());
   return expandCodeFor(S, SE, DL, "polly", Expr, Expr->getType(),
-                       InsertLocation);
+                       InsertLocation, &ValueMap);
 }
Index: lib/Support/SCEVAffinator.cpp
===================================================================
--- lib/Support/SCEVAffinator.cpp
+++ lib/Support/SCEVAffinator.cpp
@@ -33,8 +33,9 @@
     isl_pw_aff_free(CachedPair.second);
 }
 
-__isl_give isl_pw_aff *SCEVAffinator::getPwAff(const SCEV *Expr,
-                                               BasicBlock *BB) {
+__isl_give isl_pw_aff *
+SCEVAffinator::getPwAff(const SCEV *Expr, const InvariantLoadsClassesTy &ILC,
+                        BasicBlock *BB) {
   this->BB = BB;
 
   if (BB) {
@@ -44,7 +45,7 @@
   } else
     NumIterators = 0;
 
-  S->addParams(getParamsInAffineExpr(&R, Expr, SE));
+  S->addParams(getParamsInAffineExpr(&R, Expr, SE, ILC));
 
   return visit(Expr);
 }
Index: lib/Support/SCEVValidator.cpp
===================================================================
--- lib/Support/SCEVValidator.cpp
+++ lib/Support/SCEVValidator.cpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace polly;
 
 #define DEBUG_TYPE "polly-scev-validator"
 
@@ -125,10 +126,12 @@
   const Region *R;
   ScalarEvolution &SE;
   const Value *BaseAddress;
+  InvariantLoadsClassesTy *ILC;
 
 public:
-  SCEVValidator(const Region *R, ScalarEvolution &SE, const Value *BaseAddress)
-      : R(R), SE(SE), BaseAddress(BaseAddress) {}
+  SCEVValidator(const Region *R, ScalarEvolution &SE, const Value *BaseAddress,
+                InvariantLoadsClassesTy *ILC)
+      : R(R), SE(SE), BaseAddress(BaseAddress), ILC(ILC) {}
 
   class ValidatorResult visitConstant(const SCEVConstant *Constant) {
     return ValidatorResult(SCEVType::INT);
@@ -335,6 +338,23 @@
     return ValidatorResult(SCEVType::PARAM, S);
   }
 
+  ValidatorResult visitLoadInstruction(Instruction *I, const SCEV *S) {
+    if (R->contains(I) && ILC) {
+      // Insert I in the equivalence class of the pointer of I and use
+      // the first element of that equivalence class to determine the
+      // parameter that is used for I.
+      LoadInst *LInst = cast<LoadInst>(I);
+      const SCEV *PointerSCEV = SE.getSCEV(LInst->getPointerOperand());
+      auto &EquivClass = (*ILC)[PointerSCEV];
+      EquivClass.insert(cast<LoadInst>(I));
+      const SCEV *EquivClassSCEV = SE.getSCEV(EquivClass[0]);
+      return ValidatorResult(SCEVType::PARAM, EquivClassSCEV);
+    }
+
+    return visitGenericInst(I, S);
+  }
+
   ValidatorResult visitSDivInstruction(Instruction *SDiv, const SCEV *S) {
     assert(SDiv->getOpcode() == Instruction::SDiv &&
            "Assumed SDiv instruction!");
@@ -391,6 +411,8 @@
 
     if (Instruction *I = dyn_cast<Instruction>(Expr->getValue())) {
       switch (I->getOpcode()) {
+      case Instruction::Load:
+        return visitLoadInstruction(I, Expr);
       case Instruction::SDiv:
         return visitSDivInstruction(I, Expr);
       case Instruction::SRem:
@@ -550,11 +572,11 @@
 }
 
 bool isAffineExpr(const Region *R, const SCEV *Expr, ScalarEvolution &SE,
-                  const Value *BaseAddress) {
+                  const Value *BaseAddress, InvariantLoadsClassesTy *ILC) {
   if (isa<SCEVCouldNotCompute>(Expr))
     return false;
 
-  SCEVValidator Validator(R, SE, BaseAddress);
+  SCEVValidator Validator(R, SE, BaseAddress, ILC);
   DEBUG({
     dbgs() << "\n";
     dbgs() << "Expr: " << *Expr << "\n";
@@ -573,14 +595,18 @@
   return Result.isValid();
 }
 
-std::vector<const SCEV *> getParamsInAffineExpr(const Region *R,
-                                                const SCEV *Expr,
-                                                ScalarEvolution &SE,
-                                                const Value *BaseAddress) {
+std::vector<const SCEV *>
+getParamsInAffineExpr(const Region *R, const SCEV *Expr, ScalarEvolution &SE,
+                      const InvariantLoadsClassesTy &ILC,
+                      const Value *BaseAddress) {
   if (isa<SCEVCouldNotCompute>(Expr))
     return std::vector<const SCEV *>();
 
+  // TODO: The const cast is necessary since the Validator is also used to
+  //       create the equivalence classes, however here we just want the
+  //       parameters to be normalized with regard to ILC.
+  SCEVValidator Validator(R, SE, BaseAddress,
+                          const_cast<InvariantLoadsClassesTy *>(&ILC));
   ValidatorResult Result = Validator.visit(Expr);
 
   assert(Result.isValid() && "Requested parameters for an invalid SCEV!");
Index: lib/Support/ScopHelper.cpp
===================================================================
--- lib/Support/ScopHelper.cpp
+++ lib/Support/ScopHelper.cpp
@@ -13,7 +13,6 @@
 
 #include "polly/Support/ScopHelper.h"
 #include "polly/ScopInfo.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -368,3 +367,34 @@
 
   return nullptr;
 }
+
+bool polly::isHoistableLoad(LoadInst *LInst, Region &R, LoopInfo &LI,
+                            ScalarEvolution &SE) {
+  Loop *L = LI.getLoopFor(LInst->getParent());
+  const SCEV *PtrSCEV = SE.getSCEVAtScope(LInst->getPointerOperand(), L);
+  while (L && R.contains(L)) {
+    if (!SE.isLoopInvariant(PtrSCEV, L))
+      return false;
+    L = L->getParentLoop();
+  }
+
+  return true;
+}
+
+const SCEV *
+polly::normalizeInvariantLoadSCEV(const SCEV *S, ScalarEvolution &SE,
+                                  const InvariantLoadsClassesTy &ILC) {
+  const SCEVUnknown *SU = dyn_cast_or_null<SCEVUnknown>(S);
+  if (!SU)
+    return S;
+
+  LoadInst *LInst = dyn_cast<LoadInst>(SU->getValue());
+  if (!LInst)
+    return S;
+
+  const auto &EquivClass = ILC.lookup(SE.getSCEV(LInst->getPointerOperand()));
+  if (EquivClass.empty())
+    return S;
+
+  return SE.getSCEV(*EquivClass.begin());
+}
Index: test/Isl/CodeGen/invariant_load_base_pointer.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_base_pointer.ll
@@ -0,0 +1,38 @@
+; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s
+;
+; CHECK-LABEL: polly.preload.begin:
+; CHECK-NEXT: %polly.access.BPLoc = getelementptr i32*, i32** %BPLoc, i64 0
+; CHECK-NEXT: %polly.access.BPLoc.load = load i32*, i32** %polly.access.BPLoc
+;
+; CHECK-LABEL: polly.stmt.bb2:
+; CHECK-NEXT: %p_tmp3 = getelementptr inbounds i32, i32* %polly.access.BPLoc.load, i64 %polly.indvar
+;
+;    void f(int **BPLoc) {
+;      for (int i = 0; i < 1024; i++)
+;        (*BPLoc)[i] = 0;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32** %BPLoc) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %bb2, label %bb5
+
+bb2:                                              ; preds = %bb1
+  %tmp = load i32*, i32** %BPLoc, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv
+  store i32 0, i32* %tmp3, align 4
+  br label %bb4
+
+bb4:                                              ; preds = %bb2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb5:                                              ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_base_pointer_conditional.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s
+;
+; CHECK-LABEL: polly.preload.begin:
+; CHECK-NEXT: %0 = sext i32 %N to i64
+; CHECK-NEXT: %1 = icmp sge i64 %0, 514
+; CHECK-NEXT: br label %polly.preload.cond
+;
+; CHECK-LABEL: polly.preload.cond:
+; CHECK-NEXT: br i1 %1, label %polly.preload.exec, label %polly.preload.merge
+;
+; CHECK-LABEL: polly.preload.merge:
+; CHECK-NEXT: %polly.preload.tmp6.merge = phi i32* [ %polly.access.BPLoc.load, %polly.preload.exec ], [ null, %polly.preload.cond ]
+;
+; CHECK-LABEL: polly.stmt.bb5:
+; CHECK-NEXT: %p_tmp7 = getelementptr inbounds i32, i32* %polly.preload.tmp6.merge, i64 %polly.indvar6
+;
+;    void f(int **BPLoc, int *A, int N) {
+;      for (int i = 0; i < N; i++)
+;        if (i > 512)
+;          (*BPLoc)[i] = 0;
+;        else
+;          A[i] = 0;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32** %BPLoc, i32* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb11, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb11 ], [ 0, %bb ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb12
+
+bb3:                                              ; preds = %bb1
+  %tmp4 = icmp sgt i64 %indvars.iv, 512
+  br i1 %tmp4, label %bb5, label %bb8
+
+bb5:                                              ; preds = %bb3
+  %tmp6 = load i32*, i32** %BPLoc, align 8
+  %tmp7 = getelementptr inbounds i32, i32* %tmp6, i64 %indvars.iv
+  store i32 0, i32* %tmp7, align 4
+  br label %bb10
+
+bb8:                                              ; preds = %bb3
+  %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 0, i32* %tmp9, align 4
+  br label %bb10
+
+bb10:                                             ; preds = %bb8, %bb5
+  br label %bb11
+
+bb11:                                             ; preds = %bb10
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb12:                                             ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_condition.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_condition.ll
@@ -0,0 +1,54 @@
+; RUN: opt %loadPolly -polly-detect-unprofitable -polly-codegen -S < %s | FileCheck %s
+;
+; CHECK-LABEL: polly.preload.begin:
+; CHECK-NEXT: %polly.access.C = getelementptr i32, i32* %C, i64 0
+; CHECK-NEXT: %polly.access.C.load = load i32, i32* %polly.access.C
+; CHECK-NOT: %polly.access.C.load = load i32, i32* %polly.access.C
+;
+; CHECK: polly.cond
+; CHECK: %[[R0:[0-9]*]] = sext i32 %polly.access.C.load to i64
+; CHECK: %[[R1:[0-9]*]] = icmp sle i64 %[[R0]], -1
+;
+; CHECK: polly.cond
+; CHECK: %[[R2:[0-9]*]] = sext i32 %polly.access.C.load to i64
+; CHECK: %[[R3:[0-9]*]] = icmp sge i64 %[[R2]], 1
+;
+; CHECK-NOT: polly.stmt.bb2
+;
+;    void f(int *A, int *C) {
+;      for (int i = 0; i < 1024; i++)
+;        if (*C)
+;          A[i] = 0;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %C) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb7, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %bb2, label %bb8
+
+bb2:                                              ; preds = %bb1
+  %tmp = load i32, i32* %C, align 4
+  %tmp3 = icmp eq i32 %tmp, 0
+  br i1 %tmp3, label %bb6, label %bb4
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 0, i32* %tmp5, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb2, %bb4
+  br label %bb7
+
+bb7:                                              ; preds = %bb6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_escaping_second_scop.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_escaping_second_scop.ll
@@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-codegen -polly-no-early-exit -polly-detect-unprofitable -S < %s | FileCheck %s
+;
+;    void fence(void);
+;
+;    void f(int *A, int *B) {
+;      int i = 0;
+;      int x = 0;
+;
+;      do {
+;        x = *B;
+;    S:  A[i] += x;
+;      } while (i++ < 100);
+;
+;      fence();
+;
+;      do {
+;    P:  A[i]++;
+;      } while (i++ < x / 2);
+;    }
+;
+; CHECK: polly.start:
+; CHECK-NEXT: sext i32 %tmp.merge to i64
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B) {
+entry:
+  br label %stmt.S
+
+stmt.S:                                           ; preds = %do.cond, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %do.cond ], [ 0, %entry ]
+  %tmp = load i32, i32* %B, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv2
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp4, %tmp
+  store i32 %add, i32* %arrayidx, align 4
+  br label %do.cond
+
+do.cond:                                          ; preds = %do.body
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond = icmp ne i64 %indvars.iv.next3, 101
+  br i1 %exitcond, label %stmt.S, label %do.end
+
+do.end:                                           ; preds = %do.cond
+  %tmp5 = trunc i64 101 to i32
+  call void @fence() #2
+  %tmp6 = sext i32 %tmp5 to i64
+  br label %stmt.P
+
+stmt.P:                                           ; preds = %do.cond.5, %do.end
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond.5 ], [ %tmp6, %do.end ]
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp7 = load i32, i32* %arrayidx3, align 4
+  %inc4 = add nsw i32 %tmp7, 1
+  store i32 %inc4, i32* %arrayidx3, align 4
+  br label %do.cond.5
+
+do.cond.5:                                        ; preds = %do.body.1
+  %div = sdiv i32 %tmp, 2
+  %tmp8 = sext i32 %div to i64
+  %cmp7 = icmp slt i64 %indvars.iv, %tmp8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br i1 %cmp7, label %stmt.P, label %do.end.8
+
+do.end.8:                                         ; preds = %do.cond.5
+  ret void
+}
+
+declare void @fence()
Index: test/Isl/CodeGen/invariant_load_loop_ub.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_loop_ub.ll
@@ -0,0 +1,34 @@
+; RUN: opt %loadPolly -polly-codegen -polly-detect-unprofitable -S < %s | FileCheck %s
+;
+; CHECK: polly.start
+;
+;    void f(int *A, int *UB) {
+;      for (int i = 0; i < *UB; i++)
+;        A[i] = 0;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %UB) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb6, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb6 ], [ 0, %bb ]
+  %tmp = load i32, i32* %UB, align 4
+  %tmp2 = sext i32 %tmp to i64
+  %tmp3 = icmp slt i64 %indvars.iv, %tmp2
+  br i1 %tmp3, label %bb4, label %bb7
+
+bb4:                                              ; preds = %bb1
+  %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 0, i32* %tmp5, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb7:                                              ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_parameters_cyclic_dependence.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -0,0 +1,75 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+;
+; This caused the code generation to emit a broken module as there are two
+; dependences that need to be considered, thus code has to be emitted in a
+; certain order:
+;   1) To preload A[N * M] the expression N * M [p0] is needed (both for the
+;      condition under which A[N * M] is executed as well as to compute the
+;      index).
+;   2) To generate (A[N * M] / 2) [p1] the preloaded value is needed.
+;
+; SCOP: p0: (%N * %M)
+; SCOP: p1: (zext i32 (%tmp4 /u 2) to i64)
+;
+; CHECK: polly.preload.merge:
+; CHECK: %polly.preload.tmp4.merge = phi i32 [ %polly.access.A.load, %polly.preload.exec ], [ 0, %polly.preload.cond ]
+; CHECK: %3 = lshr i32 %polly.preload.tmp4.merge, 1
+; CHECK: %4 = zext i32 %3 to i64
+;
+;    void f(int *restrict A, int *restrict B, int N, int M) {
+;
+;      for (int i = 0; i < N * M; i++)
+;        for (int j = 0; j < A[N * M] / 2; j++)
+;          B[i + j]++;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %A, i32* noalias %B, i32 %N, i32 %M) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc.8, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc.8 ], [ 0, %entry ]
+  %mul = mul nsw i32 %N, %M
+  %tmp = sext i32 %mul to i64
+  %cmp = icmp slt i64 %indvars.iv2, %tmp
+  br i1 %cmp, label %for.body, label %for.end.10
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond.1
+
+for.cond.1:                                       ; preds = %for.inc, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
+  %mul2 = mul nsw i32 %N, %M
+  %idxprom = sext i32 %mul2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  %div = udiv i32 %tmp4, 2
+  %tmp5 = sext i32 %div to i64
+  %cmp3 = icmp slt i64 %indvars.iv, %tmp5
+  br i1 %cmp3, label %for.body.4, label %for.end
+
+for.body.4:                                       ; preds = %for.cond.1
+  %tmp6 = add nsw i64 %indvars.iv2, %indvars.iv
+  %arrayidx6 = getelementptr inbounds i32, i32* %B, i64 %tmp6
+  %tmp7 = load i32, i32* %arrayidx6, align 4
+  %inc = add nsw i32 %tmp7, 1
+  store i32 %inc, i32* %arrayidx6, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body.4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond.1
+
+for.end:                                          ; preds = %for.cond.1
+  br label %for.inc.8
+
+for.inc.8:                                        ; preds = %for.end
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end.10:                                       ; preds = %for.cond
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_ptr_ptr_noalias.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -0,0 +1,44 @@
+; RUN: opt %loadPolly -polly-detect-unprofitable -polly-codegen -polly-ignore-aliasing -S -polly-no-early-exit < %s | FileCheck %s
+;
+; CHECK-LABEL: polly.preload.begin:
+; CHECK: %polly.access.A = getelementptr i32**, i32*** %A, i64 42
+; CHECK: %polly.access.A.load = load i32**, i32*** %polly.access.A
+; CHECK: %polly.access.polly.access.A.load = getelementptr i32*, i32** %polly.access.A.load, i64 32
+; CHECK: %polly.access.polly.access.A.load.load = load i32*, i32** %polly.access.polly.access.A.load
+;
+; CHECK: polly.stmt.bb2:
+; CHECK: %p_tmp6 = getelementptr inbounds i32, i32* %polly.access.polly.access.A.load.load, i64 %polly.indvar
+; CHECK: store i32 0, i32* %p_tmp6, align 4
+;
+;    void f(int ***A) {
+;      for (int i = 0; i < 1024; i++)
+;        A[42][32][i] = 0;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32*** %A) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb7, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %bb2, label %bb8
+
+bb2:                                              ; preds = %bb1
+  %tmp = getelementptr inbounds i32**, i32*** %A, i64 42
+  %tmp3 = load i32**, i32*** %tmp, align 8
+  %tmp4 = getelementptr inbounds i32*, i32** %tmp3, i64 32
+  %tmp5 = load i32*, i32** %tmp4, align 8
+  %tmp6 = getelementptr inbounds i32, i32* %tmp5, i64 %indvars.iv
+  store i32 0, i32* %tmp6, align 4
+  br label %bb7
+
+bb7:                                              ; preds = %bb2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/invariant_load_scalar_dep.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/invariant_load_scalar_dep.ll
@@ -0,0 +1,44 @@
+; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s
+;
+; CHECK-LABEL: polly.preload.begin:
+; CHECK: %polly.access.B = getelementptr i32, i32* %B, i64 0
+; CHECK: %polly.access.B.load = load i32, i32* %polly.access.B
+;
+; CHECK-LABEL: polly.stmt.bb2.split:
+; CHECK: %scevgep = getelementptr i32, i32* %A, i64 %polly.indvar
+; CHECK: store i32 %polly.access.B.load, i32* %scevgep, align 4
+;
+;    void f(int *restrict A, int *restrict B) {
+;      for (int i = 0; i < 1024; i++)
+;        auto tmp = *B;
+;        // Split BB
+;        A[i] = tmp;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %A, i32* noalias %B) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %bb2, label %bb5
+
+bb2:                                              ; preds = %bb1
+  %tmp = load i32, i32* %B, align 4
+  br label %bb2.split
+
+bb2.split:
+  %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %tmp, i32* %tmp3, align 4
+  br label %bb4
+
+bb4:                                              ; preds = %bb2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb5:                                              ; preds = %bb1
+  ret void
+}
Index: test/Isl/CodeGen/reduction_2.ll
===================================================================
--- test/Isl/CodeGen/reduction_2.ll
+++ test/Isl/CodeGen/reduction_2.ll
@@ -88,5 +88,15 @@
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
 
-; CHECK: for (int c0 = 0; c0 <= 1018; c0 += 1)
-; CHECK:   Stmt_for_body(c0);
+; Negative test. At the moment we will optimistically assume RED[0] in the
+; conditional after the loop might be invariant and expand the SCoP from the
+; loop to include the conditional. However, during SCoP generation we will
+; realize that RED[0] is in fact not invariant and bail.
+;
+; Possible solutions could be:
+;   - Do not optimistically assume it to be invariant (as before this commit);
+;     however, we would lose a lot of invariant cases due to possible aliasing.
+;   - Reduce the size of the SCoP if an assumed invariant access is in fact not
+;     invariant instead of rejecting the whole region.
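
The bail-out this negative test documents is the check added to
Scop::hoistInvariantLoads earlier in this patch; in essence (a simplified
restatement for review, not additional code):

    // A load that detection required to be invariant but that still has
    // accesses in its statement was not hoisted because its location is also
    // written, so the SCoP is dropped via an unsatisfiable assumed context.
    for (const auto &EquivClass : getRIL())
      for (LoadInst *LI : EquivClass.second)
        if (getStmtForBasicBlock(LI->getParent())->lookupAccessesFor(LI)) {
          addAssumption(isl_set_empty(getParamSpace()));
          return;
        }
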
+;
+; CHECK-NOT: for (int c0 = 0; c0 <= 1018; c0 += 1)
+; CHECK-NOT:   Stmt_for_body(c0);
Index: test/Isl/CodeGen/whole-scop-non-affine-subregion.ll
===================================================================
--- test/Isl/CodeGen/whole-scop-non-affine-subregion.ll
+++ test/Isl/CodeGen/whole-scop-non-affine-subregion.ll
@@ -2,9 +2,9 @@
 ; RUN:     -polly-codegen -S < %s | FileCheck %s
 
 ; CHECK: polly.start
-
+;    int /* pure */ g()
 ;    void f(int *A) {
-;      if (*A > 42)
+;      if (g())
 ;        *A = *A + 1;
 ;      else
 ;        *A = *A - 1;
@@ -17,22 +17,26 @@
   br label %entry.split
 
 entry.split:
-  %tmp = load i32, i32* %A, align 4
-  %cmp = icmp sgt i32 %tmp, 42
+  %call = call i32 @g()
+  %cmp = icmp eq i32 %call, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
   %tmp1 = load i32, i32* %A, align 4
   %add = add nsw i32 %tmp1, 1
+  store i32 %add, i32* %A, align 4
   br label %if.end
 
 if.else:                                          ; preds = %entry
   %tmp2 = load i32, i32* %A, align 4
   %sub = add nsw i32 %tmp2, -1
+  store i32 %sub, i32* %A, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %storemerge = phi i32 [ %sub, %if.else ], [ %add, %if.then ]
-  store i32 %storemerge, i32* %A, align 4
   ret void
 }
+
+declare i32 @g() #0
+
+attributes #0 = { nounwind readnone }
Index: test/ScopDetect/base_pointer.ll
===================================================================
--- test/ScopDetect/base_pointer.ll
+++ test/ScopDetect/base_pointer.ll
@@ -29,7 +29,7 @@
 }
 
 ; CHECK-LABEL: base_pointer_in_condition
-; CHECK: Valid Region for Scop: for.i => then
 
 define void @base_pointer_is_argument(float* %A, i64 %n) {
 entry:
@@ -292,4 +292,4 @@
 }
 
 ; CHECK: base_pointer_is_ptr2ptr
-; CHECK-NOT: Valid Region for Scop
+; CHECK: Valid Region for Scop: for.j => for.i.inc
Index: test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
===================================================================
--- test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -3,7 +3,7 @@
 ; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-detect -analyze < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINEALL
 
 ; void f(int A[], int n) {
-;   for (int i = 0; i < A[n]; i++)
+;   for (int i = 0; i < A[n+i]; i++)
 ;     A[i] = 0;
 ; }
 
@@ -52,7 +52,8 @@
   %inc = trunc i64 %1 to i32, !dbg !21
   store i32 0, i32* %arrayidx2, align 4, !dbg !24
   tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !18, metadata !DIExpression()), !dbg !20
-  %2 = load i32, i32* %arrayidx, align 4, !dbg !21
+  %arrayidx3 = getelementptr inbounds i32, i32* %arrayidx, i64 %indvar, !dbg !21
+  %2 = load i32, i32* %arrayidx3, align 4, !dbg !21
   %cmp = icmp slt i32 %inc, %2, !dbg !21
   %indvar.next = add i64 %indvar, 1, !dbg !21
   br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !dbg !21
Index: test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
===================================================================
--- test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -6,7 +6,7 @@
 ;
 ; void a(struct b *A) {
 ;   for (int i=0; i<32; i++)
-;     A->b[i] = 0;
+;     A[i].b[i] = 0;
 ; }
 
 ; CHECK: remark: ReportVariantBasePtr01.c:6:8: The following errors keep this region from being a Scop.
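
Why the detection tests above change: with invariant load hoisting, a base
pointer that is merely loaded from a region-invariant location no longer
counts as variant, so these tests now need genuinely variant addresses to
trigger their diagnostics. A hypothetical C contrast (illustrative only, not
taken from the tests verbatim):

    // Accepted after this patch: *BPLoc is region-invariant and hoistable.
    for (int i = 0; i < n; i++)
      (*BPLoc)[i] = 0;

    // Still reported as a variant base pointer: the address depends on i.
    for (int i = 0; i < n; i++)
      A[i].b[i] = 0;
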
@@ -23,11 +23,11 @@
 entry.split: ; preds = %entry
 tail call void @llvm.dbg.value(metadata %struct.b* %A, i64 0, metadata !16, metadata !DIExpression()), !dbg !23
 tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !25
- %b = getelementptr inbounds %struct.b, %struct.b* %A, i64 0, i32 0, !dbg !26
 br label %for.body, !dbg !27
 
 for.body: ; preds = %for.body, %entry.split
 %indvar4 = phi i64 [ %indvar.next, %for.body ], [ 0, %entry.split ]
+ %b = getelementptr inbounds %struct.b, %struct.b* %A, i64 %indvar4, i32 0, !dbg !26
 %0 = mul i64 %indvar4, 4, !dbg !26
 %1 = add i64 %0, 3, !dbg !26
 %2 = add i64 %0, 2, !dbg !26
Index: test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
===================================================================
--- test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
+++ test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
@@ -31,8 +31,10 @@
 ; void f(int * restrict A, int * restrict C) {
 ; int j;
 ; for (int i = 0; i < 1024; i++) {
-; while ((j = C[i]))
+; while ((j = C[i++])) {
 ; A[j]++;
+; if (true) break;
+; }
 ; }
 ; }
 ;
@@ -62,7 +64,7 @@
 %tmp9 = load i32, i32* %tmp8, align 4
 %tmp10 = add nsw i32 %tmp9, 1
 store i32 %tmp10, i32* %tmp8, align 4
- br label %bb3
+ br i1 true, label %bb11, label %bb3
 
 bb11: ; preds = %bb3
 br label %bb12
Index: test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
===================================================================
--- test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
+++ test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
@@ -6,35 +6,19 @@
 ; RUN: -polly-detect-unprofitable -analyze < %s | FileCheck %s \
 ; RUN: --check-prefix=ALL
 ;
-; INNERMOST: Function: f
-; INNERMOST: Region: %bb9---%bb17
-; INNERMOST: Max Loop Depth: 1
-; INNERMOST: Context:
-; INNERMOST: [N] -> { :
-; INNERMOST-DAG: N >= -2147483648
-; INNERMOST-DAG: and
-; INNERMOST-DAG: N <= 2147483647
-; INNERMOST }
-; INNERMOST: Assumed Context:
-; INNERMOST: [N] -> { : }
-; INNERMOST: p0: %N
-; INNERMOST: Alias Groups (0):
-; INNERMOST: n/a
-; INNERMOST: Statements {
-; INNERMOST: Stmt_bb11
-; INNERMOST: Domain :=
-; INNERMOST: [N] -> { Stmt_bb11[i0] :
-; INNERMOST-DAG: i0 >= 0
-; INNERMOST-DAG: and
-; INNERMOST-DAG: i0 <= -1 + N
-; INNERMOST: }
-; INNERMOST: Schedule :=
-; INNERMOST: [N] -> { Stmt_bb11[i0] -> [i0] };
-; INNERMOST: ReadAccess := [Reduction Type: +] [Scalar: 0]
-; INNERMOST: [N] -> { Stmt_bb11[i0] -> MemRef_A[i0] };
-; INNERMOST: MustWriteAccess := [Reduction Type: +] [Scalar: 0]
-; INNERMOST: [N] -> { Stmt_bb11[i0] -> MemRef_A[i0] };
-; INNERMOST: }
+; Negative test for INNERMOST.
+; At the moment we will optimistically assume A[i] in the conditional before the inner
+; loop might be invariant and expand the SCoP from the loop to include the conditional. However,
+; during SCoP generation we will realize that A[i] is in fact not invariant (in this region, i.e., the body
+; of the outer loop) and bail.
+;
+; Possible solutions could be:
+; - Do not optimistically assume it to be invariant (as before this commit); however, we would lose
+; a lot of invariant cases due to possible aliasing.
+; - Reduce the size of the SCoP if an assumed invariant access is in fact not invariant instead of
+; rejecting the whole region.
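+;
+; The situation, as a hypothetical C sketch (the actual test source may differ):
+;
+; void g(int *A, int N) {
+; for (int i = 0; i < N; i++) // outer loop; its body is the region
+; if (A[i]) // load optimistically assumed invariant
+; for (int j = 0; j < N; j++)
+; A[j]++; // A is written here, so A[i] is not invariant
+; }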
+;
+; INNERMOST-NOT: Function: f
 ;
 ; ALL: Function: f
 ; ALL: Region: %bb3---%bb19
Index: test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
===================================================================
--- test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
+++ test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
@@ -5,35 +5,19 @@
 ; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
 ; RUN: -analyze < %s | FileCheck %s --check-prefix=ALL
 ;
-; INNERMOST: Function: f
-; INNERMOST: Region: %bb9---%bb18
-; INNERMOST: Max Loop Depth: 1
-; INNERMOST: Context:
-; INNERMOST: [p_0] -> { :
-; INNERMOST-DAG: p_0 >= -2199023255552
-; INNERMOST-DAG: and
-; INNERMOST-DAG: p_0 <= 2199023254528
-; INNERMOST: }
-; INNERMOST: Assumed Context:
-; INNERMOST: [p_0] -> { : }
-; INNERMOST: p0: {0,+,(sext i32 %N to i64)}<%bb3>
-; INNERMOST: Alias Groups (0):
-; INNERMOST: n/a
-; INNERMOST: Statements {
-; INNERMOST: Stmt_bb12
-; INNERMOST: Domain :=
-; INNERMOST: [p_0] -> { Stmt_bb12[i0] :
-; INNERMOST-DAG: i0 >= 0
-; INNERMOST-DAG: and
-; INNERMOST-DAG: i0 <= -1 + p_0
-; INNERMOST: }
-; INNERMOST: Schedule :=
-; INNERMOST: [p_0] -> { Stmt_bb12[i0] -> [i0] };
-; INNERMOST: ReadAccess := [Reduction Type: +] [Scalar: 0]
-; INNERMOST: [p_0] -> { Stmt_bb12[i0] -> MemRef_A[i0] };
-; INNERMOST: MustWriteAccess := [Reduction Type: +] [Scalar: 0]
-; INNERMOST: [p_0] -> { Stmt_bb12[i0] -> MemRef_A[i0] };
-; INNERMOST: }
+; Negative test for INNERMOST.
+; At the moment we will optimistically assume A[i] in the conditional before the inner
+; loop might be invariant and expand the SCoP from the loop to include the conditional. However,
+; during SCoP generation we will realize that A[i] is in fact not invariant (in this region, i.e., the body
+; of the outer loop) and bail.
+;
+; Possible solutions could be:
+; - Do not optimistically assume it to be invariant (as before this commit); however, we would lose
+; a lot of invariant cases due to possible aliasing.
+; - Reduce the size of the SCoP if an assumed invariant access is in fact not invariant instead of
+; rejecting the whole region.
+;
+; INNERMOST-NOT: Function: f
 ;
 ; ALL: Function: f
 ; ALL: Region: %bb3---%bb20
Index: test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
===================================================================
--- test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
+++ test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
@@ -17,8 +17,8 @@
 ; CHECK: Invariant Accesses: {
 ; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
 ; CHECK: MemRef_init_ptr[0]
-; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
-; CHECK: MemRef_init_ptr[0]
+; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NOT: MemRef_init_ptr[0]
 ; CHECK: }
 define void @f(i64* noalias %A, i64 %N, i64* noalias %init_ptr) #0 {
 entry:
Index: test/ScopInfo/invariant_load_base_pointer.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_base_pointer.ll
@@ -0,0 +1,35 @@
+; RUN: opt %loadPolly -polly-scops -polly-ignore-aliasing -polly-detect-unprofitable -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_BPLoc[0] };
+;
+; void f(int **BPLoc) {
+; for (int i = 0; i < 1024; i++)
+; (*BPLoc)[i] = 0;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32** %BPLoc) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb4, %bb
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+ %exitcond = icmp ne i64 %indvars.iv, 1024
+ br i1 %exitcond, label %bb2, label %bb5
+
+bb2: ; preds = %bb1
+ %tmp = load i32*, i32** %BPLoc, align 8
+ %tmp3 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv
+ store i32 0, i32* %tmp3, align 4
+ br label %bb4
+
+bb4: ; preds = %bb2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %bb1
+
+bb5: ; preds = %bb1
+ ret void
+}
Index: test/ScopInfo/invariant_load_base_pointer_conditional.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_base_pointer_conditional.ll
@@ -0,0 +1,51 @@
+; RUN: opt %loadPolly -polly-scops -polly-ignore-aliasing -polly-detect-unprofitable -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: [N] -> { Stmt_bb5[i0] -> MemRef_BPLoc[0] };
+;
+; void f(int **BPLoc, int *A, int N) {
+; for (int i = 0; i < N; i++)
+; if (i > 512)
+; (*BPLoc)[i] = 0;
+; else
+; A[i] = 0;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32** %BPLoc, i32* %A, i32 %N) {
+bb:
+ %tmp = sext i32 %N to i64
+ br label %bb1
+
+bb1: ; preds = %bb11, %bb
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb11 ], [ 0, %bb ]
+ %tmp2 = icmp slt i64 %indvars.iv, %tmp
+ br i1 %tmp2, label %bb3, label %bb12
+
+bb3: ; preds = %bb1
+ %tmp4 = icmp sgt i64 %indvars.iv, 512
+ br i1 %tmp4, label %bb5, label %bb8
+
+bb5: ; preds = %bb3
+ %tmp6 = load i32*, i32** %BPLoc, align 8
+ %tmp7 = getelementptr inbounds i32, i32* %tmp6, i64 %indvars.iv
+ store i32 0, i32* %tmp7, align 4
+ br label %bb10
+
+bb8: ; preds = %bb3
+ %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 0, i32* %tmp9, align 4
+ br label %bb10
+
+bb10: ; preds = %bb8, %bb5
+ br label %bb11
+
+bb11: ; preds = %bb10
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %bb1
+
+bb12: ; preds = %bb1
+ ret void
+}
Index: test/ScopInfo/invariant_load_condition.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_condition.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -polly-detect-unprofitable -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_C[0] };
+;
+; void f(int *A, int *C) {
+; for (int i = 0; i < 1024; i++)
+; if (*C)
+; A[i] = 0;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %C) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb7, %bb
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ]
+ %exitcond = icmp ne i64 %indvars.iv, 1024
+ br i1 %exitcond, label %bb2, label %bb8
+
+bb2: ; preds = %bb1
+ %tmp = load i32, i32* %C, align 4
+ %tmp3 = icmp eq i32 %tmp, 0
+ br i1 %tmp3, label %bb6, label %bb4
+
+bb4: ; preds = %bb2
+ %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 0, i32* %tmp5, align 4
+ br label %bb6
+
+bb6: ; preds = %bb2, %bb4
+ br label %bb7
+
+bb7: ; preds = %bb6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %bb1
+
+bb8: ; preds = %bb1
+ ret void
+}
Index: test/ScopInfo/invariant_load_loop_ub.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_loop_ub.ll
@@ -0,0 +1,36 @@
+; RUN: opt %loadPolly -polly-scops -polly-detect-unprofitable -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb1[i0] -> MemRef_UB[0] };
+;
+; void f(int *A, int *UB) {
+; for (int i = 0; i < *UB; i++)
+; A[i] = 0;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %UB) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb6, %bb
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb6 ], [ 0, %bb ]
+ %tmp = load i32, i32* %UB, align 4
+ %tmp2 = sext i32 %tmp to i64
+ %tmp3 = icmp slt i64 %indvars.iv, %tmp2
+ br i1 %tmp3, label %bb4, label %bb7
+
+bb4: ; preds = %bb1
+ %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 0, i32* %tmp5, align 4
+ br label %bb6
+
+bb6: ; preds = %bb4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
Index: test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
===================================================================
--- test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,6 +1,15 @@
 ; RUN: opt %loadPolly -tbaa -polly-scops -polly-ignore-aliasing \
 ; RUN: -polly-detect-unprofitable -analyze < %s | FileCheck %s
 ;
+; Note: The order of the invariant accesses is important!
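+; The second preloaded access uses the result of the first as its base
+; pointer; roughly, as a hypothetical C sketch (assuming int ***A):
+;
+; int **tmp3 = A[42]; // must be materialized first
+; int *tmp5 = tmp3[32]; // base pointer originates from the preceding load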
+;
+; CHECK: Invariant Accesses: {
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_A[42]
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_tmp3[32]
+; CHECK: }
+;
 ; CHECK: Arrays {
 ; CHECK: i32** MemRef_A[*][8]
 ; CHECK: i32* MemRef_tmp3[*][8] [BasePtrOrigin: MemRef_A]
@@ -31,11 +40,11 @@
 bb2: ; preds = %bb1
 %tmp = getelementptr inbounds i32**, i32*** %A, i64 42
- %tmp3 = load i32**, i32*** %tmp, align 8, !tbaa !1
+ %tmp3 = load i32**, i32*** %tmp, align 8
 %tmp4 = getelementptr inbounds i32*, i32** %tmp3, i64 32
- %tmp5 = load i32*, i32** %tmp4, align 8, !tbaa !1
+ %tmp5 = load i32*, i32** %tmp4, align 8
 %tmp6 = getelementptr inbounds i32, i32* %tmp5, i64 %indvars.iv
- store i32 0, i32* %tmp6, align 4, !tbaa !5
+ store i32 0, i32* %tmp6, align 4
 br label %bb7
 
 bb7: ; preds = %bb2
@@ -45,11 +54,3 @@
 bb8: ; preds = %bb1
 ret void
 }
-
-!0 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git 9e282ff441e7a367dc711e41fd19d27ffc0f78d6)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"any pointer", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"int", !3, i64 0}
Index: test/ScopInfo/invariant_load_scalar_dep.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_scalar_dep.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -polly-detect-unprofitable -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_B[0] };
+; CHECK-NOT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1]
+; CHECK-NOT: { Stmt_bb2[i0] -> MemRef_tmp[] };
+;
+; void f(int *restrict A, int *restrict B) {
+; for (int i = 0; i < 1024; i++)
+; auto tmp = *B;
+; // Split BB
+; A[i] = tmp;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %A, i32* noalias %B) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb4, %bb
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+ %exitcond = icmp ne i64 %indvars.iv, 1024
+ br i1 %exitcond, label %bb2, label %bb5
+
+bb2: ; preds = %bb1
+ %tmp = load i32, i32* %B, align 4
+ br label %bb2b
+
+bb2b:
+ %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %tmp, i32* %tmp3, align 4
+ br label %bb4
+
+bb4: ; preds = %bb2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %bb1
+
+bb5: ; preds = %bb1
+ ret void
+}
Index: test/ScopInfo/invariant_loads_complicated_dependences.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_loads_complicated_dependences.ll
@@ -0,0 +1,85 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses: {
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: [tmp, tmp5] -> { Stmt_for_body[i0] -> MemRef_LB[0] };
+; CHECK-NEXT: Execution Context: [tmp, tmp5] -> { : }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: [tmp, tmp5] -> { Stmt_do_cond[i0, i1] -> MemRef_UB[0] };
+; CHECK-NEXT: Execution Context: [tmp, tmp5] -> { : }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: [tmp, tmp5] -> { Stmt_if_else[i0, i1] -> MemRef_U[0] };
+; CHECK-NEXT: Execution Context: [tmp, tmp5] -> { : tmp <= 5 }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: [tmp, tmp5] -> { Stmt_if_then[i0, i1] -> MemRef_V[0] };
+; CHECK-NEXT: Execution Context: [tmp, tmp5] -> { : (tmp5 >= 1 + tmp and tmp5 >= 6) or tmp >= 6 }
+; CHECK-NEXT: }
+;
+; void f(int *restrict A, int *restrict V, int *restrict U, int *restrict UB,
+; int *restrict LB) {
+; for (int i = 0; i < 100; i++) {
+; int j = /* invariant load */ *LB;
+; do {
+; if (j > 5)
+; A[i] += /* invariant load */ *V;
+; else
+; A[i] += /* invariant load */ *U;
+; } while (j++ < /* invariant load */ *UB);
+; }
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %A, i32* noalias %V, i32* noalias %U, i32* noalias %UB, i32* noalias %LB) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %exitcond = icmp ne i64 %indvars.iv, 100
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %tmp = load i32, i32* %LB, align 4
+ br label %do.body
+
+do.body: ; preds = %do.cond, %for.body
+ %j.0 = phi i32 [ %tmp, %for.body ], [ %inc, %do.cond ]
+ %cmp1 = icmp sgt i32 %j.0, 5
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %do.body
+ %tmp1 = load i32, i32* %V, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp2 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %tmp2, %tmp1
+ store i32 %add, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %do.body
+ %tmp3 = load i32, i32* %U, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp4 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %tmp4, %tmp3
+ store i32 %add4, i32* %arrayidx3, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ br label %do.cond
+
+do.cond: ; preds = %if.end
+ %inc = add nsw i32 %j.0, 1
+ %tmp5 = load i32, i32* %UB, align 4
+ %cmp5 = icmp slt i32 %j.0, %tmp5
+ br i1 %cmp5, label %do.body, label %do.end
+
+do.end: ; preds = %do.cond
+ br label %for.inc
+
+for.inc: ; preds = %do.end
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
Index: test/ScopInfo/invariant_loads_cyclic_dependences.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_loads_cyclic_dependences.ll
@@ -0,0 +1,63 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s
+;
+; Negative test. If we assume UB[*V] to be invariant, we get a cyclic
+; dependence in the invariant loads that needs to be resolved by
+; ignoring the actual accessed address and focusing on the fact
+; that the access happened. However, at the moment we assume UB[*V]
+; not to be loop invariant and thus reject this region.
+;
+; CHECK-NOT: Statements
+;
+;
+; void f(int *restrict V, int *restrict UB, int *restrict A) {
+; for (int i = 0; i < 100; i++) {
+; int j = 0;
+; int x = 0;
+; do {
+; x = /* invariant load dependent on UB[*V] */ *V;
+; A[j + i]++;
+; } while (j++ < /* invariant load dependent on *V */ UB[x]);
+; }
+; }
+;
+target datalayout = "e-m:e-i32:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %V, i32* noalias %UB, i32* noalias %A) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv2 = phi i32 [ %indvars.iv.next3, %for.inc ], [ 0, %entry ]
+ %exitcond = icmp ne i32 %indvars.iv2, 100
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ br label %do.body
+
+do.body: ; preds = %do.cond, %for.body
+ %indvars.iv = phi i32 [ %indvars.iv.next, %do.cond ], [ 0, %for.body ]
+ %tmp = load i32, i32* %V, align 4
+ %tmp4 = add nuw nsw i32 %indvars.iv, %indvars.iv2
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %tmp4
+ %tmp5 = load i32, i32* %arrayidx, align 4
+ %inc = add nsw i32 %tmp5, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ br label %do.cond
+
+do.cond: ; preds = %do.body
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds i32, i32* %UB, i32 %tmp
+ %tmp6 = load i32, i32* %arrayidx3, align 4
+ %cmp4 = icmp slt i32 %indvars.iv, %tmp6
+ br i1 %cmp4, label %do.body, label %do.end
+
+do.end: ; preds = %do.cond
+ br label %for.inc
+
+for.inc: ; preds = %do.end
+ %indvars.iv.next3 = add nuw nsw i32 %indvars.iv2, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
Index: test/ScopInfo/invariant_loop_bounds.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_loop_bounds.ll
@@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses: {
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[2]
+; CHECK-NEXT: Execution Context: [tmp, tmp8, tmp10] -> { : }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[1]
+; CHECK-NEXT: Execution Context: [tmp, tmp8, tmp10] -> { : tmp >= 1 }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[0]
+; CHECK-NEXT: Execution Context: [tmp, tmp8, tmp10] -> { : tmp8 >= 1 and tmp >= 1 }
+; CHECK-NEXT: }
+;
+; CHECK: p0: %tmp
+; CHECK: p1: %tmp8
+; CHECK: p2: %tmp10
+; CHECK: Statements {
+; CHECK: Stmt_for_body_6
+; CHECK: Domain :=
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] : i0 >= 0 and i0 <= -1 + tmp and i1 >= 0 and i1 <= -1 + tmp8 and i2 >= 0 and i2 <= -1 + tmp10 };
+; CHECK: Schedule :=
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> [i0, i1, i2] };
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: }
+;
+; int bounds[3];
+; double data[1024][1024][1024];
+;
+; void foo() {
+; int i, j, k;
+; for (k = 0; k < bounds[2]; k++)
+; for (j = 0; j < bounds[1]; j++)
+; for (i = 0; i < bounds[0]; i++)
+; data[k][j][i] += i + j + k;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@bounds = common global [3 x i32] zeroinitializer, align 4
+@data = common global [1024 x [1024 x [1024 x double]]] zeroinitializer, align 16
+
+define void @foo() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc.16, %entry
+ %indvars.iv5 = phi i64 [ %indvars.iv.next6, %for.inc.16 ], [ 0, %entry ]
+ %tmp = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 2), align 4
+ %tmp7 = sext i32 %tmp to i64
+ %cmp = icmp slt i64 %indvars.iv5, %tmp7
+ br i1 %cmp, label %for.body, label %for.end.18
+
+for.body: ; preds = %for.cond
+ br label %for.cond.1
+
+for.cond.1: ; preds = %for.inc.13, %for.body
+ %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc.13 ], [ 0, %for.body ]
+ %tmp8 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 1), align 4
+ %tmp9 = sext i32 %tmp8 to i64
+ %cmp2 = icmp slt i64 %indvars.iv3, %tmp9
+ br i1 %cmp2, label %for.body.3, label %for.end.15
+
+for.body.3: ; preds = %for.cond.1
+ br label %for.cond.4
+
+for.cond.4: ; preds = %for.inc, %for.body.3
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.3 ]
+ %tmp10 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp11 = sext i32 %tmp10 to i64
+ %cmp5 = icmp slt i64 %indvars.iv, %tmp11
+ br i1 %cmp5, label %for.body.6, label %for.end
+
+for.body.6: ; preds = %for.cond.4
+ %tmp12 = add nsw i64 %indvars.iv, %indvars.iv3
+ %tmp13 = add nsw i64 %tmp12, %indvars.iv5
+ %tmp14 = trunc i64 %tmp13 to i32
+ %conv = sitofp i32 %tmp14 to double
+ %arrayidx11 = getelementptr inbounds [1024 x [1024 x [1024 x double]]], [1024 x [1024 x [1024 x double]]]* @data, i64 0, i64 %indvars.iv5, i64 %indvars.iv3, i64 %indvars.iv
+ %tmp15 = load double, double* %arrayidx11, align 8
+ %add12 = fadd double %tmp15, %conv
+ store double %add12, double* %arrayidx11, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body.6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond.4
+
+for.end: ; preds = %for.cond.4
+ br label %for.inc.13
+
+for.inc.13: ; preds = %for.end
+ %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+ br label %for.cond.1
+
+for.end.15: ; preds = %for.cond.1
+ br label %for.inc.16
+
+for.inc.16: ; preds = %for.end.15
+ %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
+ br label %for.cond
+
+for.end.18: ; preds = %for.cond
+ ret void
+}
Index: test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
@@ -0,0 +1,106 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; Verify that we only have one parameter and one invariant load for all
+; three loads that occur in the region but actually access the same
+; location. Also check that the execution context is the most generic
+; one, i.e., here the universal set.
+;
+; CHECK: Invariant Accesses: {
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[0]
+; CHECK-NEXT: Execution Context: [tmp] -> { : }
+; CHECK-NEXT: }
+;
+; CHECK: p0: %tmp
+; CHECK-NOT: p1
+; CHECK: Statements {
+; CHECK: Stmt_for_body_6
+; CHECK: Domain :=
+; CHECK: [tmp] -> { Stmt_for_body_6[i0, i1, i2] : i0 >= 0 and i0 <= -1 + tmp and i1 >= 0 and i1 <= -1 + tmp and i2 >= 0 and i2 <= -1 + tmp };
+; CHECK: Schedule :=
+; CHECK: [tmp] -> { Stmt_for_body_6[i0, i1, i2] -> [i0, i1, i2] };
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: }
+;
+; int bounds[1];
+; double data[1024][1024][1024];
+;
+; void foo() {
+; int i, j, k;
+; for (k = 0; k < bounds[0]; k++)
+; for (j = 0; j < bounds[0]; j++)
+; for (i = 0; i < bounds[0]; i++)
+; data[k][j][i] += i + j + k;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@bounds = common global [1 x i32] zeroinitializer, align 4
+@data = common global [1024 x [1024 x [1024 x double]]] zeroinitializer, align 16
+
+define void @foo() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc.16, %entry
+ %indvars.iv5 = phi i64 [ %indvars.iv.next6, %for.inc.16 ], [ 0, %entry ]
+ %tmp = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp7 = sext i32 %tmp to i64
+ %cmp = icmp slt i64 %indvars.iv5, %tmp7
+ br i1 %cmp, label %for.body, label %for.end.18
+
+for.body: ; preds = %for.cond
+ br label %for.cond.1
+
+for.cond.1: ; preds = %for.inc.13, %for.body
+ %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc.13 ], [ 0, %for.body ]
+ %tmp8 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp9 = sext i32 %tmp8 to i64
+ %cmp2 = icmp slt i64 %indvars.iv3, %tmp9
+ br i1 %cmp2, label %for.body.3, label %for.end.15
+
+for.body.3: ; preds = %for.cond.1
+ br label %for.cond.4
+
+for.cond.4: ; preds = %for.inc, %for.body.3
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.3 ]
+ %tmp10 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp11 = sext i32 %tmp10 to i64
+ %cmp5 = icmp slt i64 %indvars.iv, %tmp11
+ br i1 %cmp5, label %for.body.6, label %for.end
+
+for.body.6: ; preds = %for.cond.4
+ %tmp12 = add nsw i64 %indvars.iv, %indvars.iv3
+ %tmp13 = add nsw i64 %tmp12, %indvars.iv5
+ %tmp14 = trunc i64 %tmp13 to i32
+ %conv = sitofp i32 %tmp14 to double
+ %arrayidx11 = getelementptr inbounds [1024 x [1024 x [1024 x double]]], [1024 x [1024 x [1024 x double]]]* @data, i64 0, i64 %indvars.iv5, i64 %indvars.iv3, i64 %indvars.iv
+ %tmp15 = load double, double* %arrayidx11, align 8
+ %add12 = fadd double %tmp15, %conv
+ store double %add12, double* %arrayidx11, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body.6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond.4
+
+for.end: ; preds = %for.cond.4
+ br label %for.inc.13
+
+for.inc.13: ; preds = %for.end
+ %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+ br label %for.cond.1
+
+for.end.15: ; preds = %for.cond.1
+ br label %for.inc.16
+
+for.inc.16: ; preds = %for.end.15
+ %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
+ br label %for.cond
+
+for.end.18: ; preds = %for.cond
+ ret void
+}
Index: test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
@@ -0,0 +1,109 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; Verify that we only have one parameter and one invariant load for all
+; three loads that occur in the region but actually access the same
+; location. Also check that the execution context is the most generic
+; one, i.e., here the universal set.
+;
+; CHECK: Invariant Accesses: {
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[0]
+; CHECK-NEXT: Execution Context: [tmp, p] -> { : }
+; CHECK-NEXT: }
+;
+; CHECK: p0: %tmp
+; CHECK: p1: %p
+; CHECK-NOT: p2:
+; CHECK: Statements {
+; CHECK: Stmt_for_body_6
+; CHECK: Domain :=
+; CHECK: [tmp, p] -> { Stmt_for_body_6[i0, i1, i2] : p = 0 and i0 >= 0 and i0 <= -1 + tmp and i1 >= 0 and i1 <= -1 + tmp and i2 >= 0 and i2 <= -1 + tmp };
+; CHECK: Schedule :=
+; CHECK: [tmp, p] -> { Stmt_for_body_6[i0, i1, i2] -> [i0, i1, i2] };
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp, p] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: [tmp, p] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] };
+; CHECK: }
+;
+; int bounds[1];
+; double data[1024][1024][1024];
+;
+; void foo(int p) {
+; int i, j, k;
+; for (k = 0; k < bounds[0]; k++)
+; if (p == 0)
+; for (j = 0; j < bounds[0]; j++)
+; for (i = 0; i < bounds[0]; i++)
+; data[k][j][i] += i + j + k;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@bounds = common global [1 x i32] zeroinitializer, align 4
+@data = common global [1024 x [1024 x [1024 x double]]] zeroinitializer, align 16
+
+define void @foo(i32 %p) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc.16, %entry
+ %indvars.iv5 = phi i64 [ %indvars.iv.next6, %for.inc.16 ], [ 0, %entry ]
+ %tmp = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp7 = sext i32 %tmp to i64
+ %cmp = icmp slt i64 %indvars.iv5, %tmp7
+ br i1 %cmp, label %for.body, label %for.end.18
+
+for.body: ; preds = %for.cond
+ %cmpp = icmp eq i32 %p, 0
+ br i1 %cmpp, label %for.cond.1, label %for.inc.16
+
+for.cond.1: ; preds = %for.inc.13, %for.body
+ %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc.13 ], [ 0, %for.body ]
+ %tmp8 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp9 = sext i32 %tmp8 to i64
+ %cmp2 = icmp slt i64 %indvars.iv3, %tmp9
+ br i1 %cmp2, label %for.body.3, label %for.end.15
+
+for.body.3: ; preds = %for.cond.1
+ br label %for.cond.4
+
+for.cond.4: ; preds = %for.inc, %for.body.3
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.3 ]
+ %tmp10 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @bounds, i64 0, i64 0), align 4
+ %tmp11 = sext i32 %tmp10 to i64
+ %cmp5 = icmp slt i64 %indvars.iv, %tmp11
+ br i1 %cmp5, label %for.body.6, label %for.end
+
+for.body.6: ; preds = %for.cond.4
+ %tmp12 = add nsw i64 %indvars.iv, %indvars.iv3
+ %tmp13 = add nsw i64 %tmp12, %indvars.iv5
+ %tmp14 = trunc i64 %tmp13 to i32
+ %conv = sitofp i32 %tmp14 to double
+ %arrayidx11 = getelementptr inbounds [1024 x [1024 x [1024 x double]]], [1024 x [1024 x [1024 x double]]]* @data, i64 0, i64 %indvars.iv5, i64 %indvars.iv3, i64 %indvars.iv
+ %tmp15 = load double, double* %arrayidx11, align 8
+ %add12 = fadd double %tmp15, %conv
+ store double %add12, double* %arrayidx11, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body.6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond.4
+
+for.end: ; preds = %for.cond.4
+ br label %for.inc.13
+
+for.inc.13: ; preds = %for.end
+ %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+ br label %for.cond.1
+
+for.end.15: ; preds = %for.cond.1
+ br label %for.inc.16
+
+for.inc.16: ; preds = %for.end.15
+ %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
+ br label %for.cond
+
+for.end.18: ; preds = %for.cond
+ ret void
+}
Index: test/ScopInfo/required-invariant-loop-bounds.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/required-invariant-loop-bounds.ll
@@ -0,0 +1,68 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses: {
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[0]
+; CHECK-NEXT: Execution Context: [tmp, tmp1] -> { : }
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: MemRef_bounds[1]
+; CHECK-NEXT: Execution Context: [tmp, tmp1] -> { : tmp >= 0 }
+; CHECK: }
+;
+; double A[1000][1000];
+; long bounds[2];
+;
+; void foo() {
+;
+; for (long i = 0; i <= bounds[0]; i++)
+; for (long j = 0; j <= bounds[1]; j++)
+; A[i][j] += i + j;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@bounds = common global [2 x i64] zeroinitializer, align 16
+@A = common global [1000 x [1000 x double]] zeroinitializer, align 16
+
+define void @foo() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc.6, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc7, %for.inc.6 ]
+ %tmp = load i64, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @bounds, i64 0, i64 0), align 16
+ %cmp = icmp sgt i64 %i.0, %tmp
+ br i1 %cmp, label %for.end.8, label %for.body
+
+for.body: ; preds = %for.cond
+ br label %for.cond.1
+
+for.cond.1: ; preds = %for.inc, %for.body
+ %j.0 = phi i64 [ 0, %for.body ], [ %inc, %for.inc ]
+ %tmp1 = load i64, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @bounds, i64 0, i64 1), align 8
+ %cmp2 = icmp sgt i64 %j.0, %tmp1
+ br i1 %cmp2, label %for.end, label %for.body.3
+
+for.body.3: ; preds = %for.cond.1
+ %add = add nsw i64 %i.0, %j.0
+ %conv = sitofp i64 %add to double
+ %arrayidx4 = getelementptr inbounds [1000 x [1000 x double]], [1000 x [1000 x double]]* @A, i64 0, i64 %i.0, i64 %j.0
+ %tmp2 = load double, double* %arrayidx4, align 8
+ %add5 = fadd double %tmp2, %conv
+ store double %add5, double* %arrayidx4, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body.3
+ %inc = add nuw nsw i64 %j.0, 1
+ br label %for.cond.1
+
+for.end: ; preds = %for.cond.1
+ br label %for.inc.6
+
+for.inc.6: ; preds = %for.end
+ %inc7 = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+
+for.end.8: ; preds = %for.cond
+ ret void
+}