Index: include/polly/CodeGen/IslNodeBuilder.h
===================================================================
--- include/polly/CodeGen/IslNodeBuilder.h
+++ include/polly/CodeGen/IslNodeBuilder.h
@@ -42,6 +42,9 @@
   void addParameters(__isl_take isl_set *Context);
   void create(__isl_take isl_ast_node *Node);
 
+  /// @brief Preload all memory loads that are invariant.
+  void preloadInvariantLoads();
+
   /// @brief Finalize code generation for the SCoP @p S.
   ///
   /// @see BlockGenerator::finalizeSCoP(Scop &S)
@@ -190,6 +193,24 @@
   /// @param Mark The node we generate code for.
   virtual void createMark(__isl_take isl_ast_node *Marker);
   virtual void createFor(__isl_take isl_ast_node *For);
+
+  /// @brief Preload the memory load access @p MA.
+  ///
+  /// If @p MA is not always executed it will be conditionally loaded and
+  /// merged with an undef value of the same type. Hence, if @p MA is executed
+  /// only under condition C, the preload code will look like this:
+  ///
+  ///   MA_preload = undef;
+  ///   if (C)
+  ///     MA_preload = load MA;
+  ///   use MA_preload
+  ///
+  /// Note that if the runtime context already implies the condition C,
+  /// conditional execution is not necessary.
+  Value *preloadInvariantLoad(const MemoryAccess &MA,
+                              __isl_keep isl_ast_build *Build,
+                              __isl_keep isl_set *RuntimeContext);
+
   void createForVector(__isl_take isl_ast_node *For, int VectorWidth);
   void createForSequential(__isl_take isl_ast_node *For);
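The ScopDetection changes that follow revolve around loads whose value must stay fixed for an access expression to be affine. A minimal motivating input, hypothetical and in the spirit of the new test cases:

    // The subscript "i + *p" is affine in i only if the loaded value *p can
    // be treated as a fixed SCoP parameter, i.e., only if the load of *p is
    // invariant while the region executes.
    void f(int *A, int *p) {
      for (int i = 0; i < 1024; i++)
        A[i + *p] = 0;
    }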
Index: include/polly/ScopDetection.h
===================================================================
--- include/polly/ScopDetection.h
+++ include/polly/ScopDetection.h
@@ -145,6 +145,10 @@
   using BoxedLoopsMapTy = DenseMap<const Region *, BoxedLoopsSetTy>;
   BoxedLoopsMapTy BoxedLoopsMap;
 
+  /// @brief Map to remember loads that are required to be invariant.
+  DenseMap<const Region *, SmallPtrSet<LoadInst *, 4>>
+      RequiredInvariantLoadsMap;
+
   /// @brief Context variables for SCoP detection.
   struct DetectionContext {
     Region &CurRegion; // The region to check.
@@ -180,12 +184,16 @@
     /// @brief The set of loops contained in non-affine regions.
     BoxedLoopsSetTy &BoxedLoopsSet;
 
+    /// @brief Loads that need to be invariant during execution.
+    SmallPtrSetImpl<LoadInst *> &RequiredInvariantLoads;
+
     DetectionContext(Region &R, AliasAnalysis &AA,
                      NonAffineSubRegionSetTy &NASRS, BoxedLoopsSetTy &BLS,
+                     SmallPtrSetImpl<LoadInst *> &RequiredInvariantLoads,
                      bool Verify)
         : CurRegion(R), AST(AA), Verifying(Verify), Log(&R), hasLoads(false),
           hasStores(false), hasAffineLoops(false), NonAffineSubRegionSet(NASRS),
-          BoxedLoopsSet(BLS) {}
+          BoxedLoopsSet(BLS), RequiredInvariantLoads(RequiredInvariantLoads) {}
   };
 
   // Remember the valid regions
@@ -236,6 +244,19 @@
   /// @return True if the call instruction is valid, false otherwise.
   static bool isValidCallInst(CallInst &CI);
 
+  /// @brief Check if the given loads could be invariant and can be hoisted.
+  ///
+  /// If true is returned, the loads are added to the required invariant loads
+  /// contained in the @p Context.
+  ///
+  /// @param RequiredInvariantLoads The loads to check.
+  /// @param Context                The current detection context.
+  ///
+  /// @return True if all loads can be assumed invariant.
+  bool onlyValidRequiredInvariantLoads(
+      SmallPtrSetImpl<LoadInst *> &RequiredInvariantLoads,
+      DetectionContext &Context) const;
+
   /// @brief Check if a value is invariant in the region Reg.
   ///
   /// @param Val Value to check for invariance.
@@ -271,6 +292,18 @@
   /// @return True if the instruction is valid, false otherwise.
   bool isValidInstruction(Instruction &Inst, DetectionContext &Context) const;
 
+  /// @brief Check if the SCEV @p S is affine in the current @p Context.
+  ///
+  /// This will also use a heuristic to decide if we want to require loads to
+  /// be invariant to make the expression affine or if we want to treat it as
+  /// non-affine.
+  ///
+  /// @param S           The expression to be checked.
+  /// @param Context     The context of scop detection.
+  /// @param BaseAddress The base address of the expression @p S (if any).
+  bool isAffine(const SCEV *S, DetectionContext &Context,
+                Value *BaseAddress = nullptr) const;
+
   /// @brief Check if the control flow in a basic block is valid.
   ///
   /// @param BB The BB to check the control flow.
@@ -338,6 +371,10 @@
   /// @brief Return the set of loops in non-affine subregions for @p R.
   const BoxedLoopsSetTy *getBoxedLoops(const Region *R) const;
 
+  /// @brief Return the set of required invariant loads for @p R.
+  const SmallPtrSetImpl<LoadInst *> *
+  getRequiredInvariantLoads(const Region *R) const;
+
   /// @brief Return true if @p SubR is a non-affine subregion in @p ScopR.
   bool isNonAffineSubRegion(const Region *SubR, const Region *ScopR) const;
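Passes running after detection can query the tagged loads per region through the accessor declared above. A minimal consumer sketch, assuming a ScopDetection instance SD and a region R (both placeholders):

    // Inspect every load that must stay invariant for R to be a valid SCoP.
    if (const auto *RIL = SD.getRequiredInvariantLoads(&R))
      for (llvm::LoadInst *LI : *RIL)
        llvm::errs() << "required invariant load: " << *LI << "\n";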
Index: include/polly/ScopInfo.h
===================================================================
--- include/polly/ScopInfo.h
+++ include/polly/ScopInfo.h
@@ -127,6 +127,9 @@
   /// @brief Destructor to free the isl id of the base pointer.
   ~ScopArrayInfo();
 
+  /// @brief Set the base pointer to @p BP.
+  void setBasePtr(Value *BP) { BasePtr = BP; }
+
   /// @brief Return the base pointer.
   Value *getBasePtr() const { return BasePtr; }
 
@@ -640,6 +643,9 @@
 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                               MemoryAccess::ReductionType RT);
 
+/// @brief List to hold all (scalar) memory accesses mapped to an instruction.
+using MemoryAccessList = std::forward_list<MemoryAccess *>;
+
 ///===----------------------------------------------------------------------===//
 /// @brief Statement of the Scop
 ///
@@ -650,9 +656,6 @@
 /// At the moment every statement represents a single basic block of LLVM-IR.
 class ScopStmt {
 public:
-  /// @brief List to hold all (scalar) memory accesses mapped to an instruction.
-  using MemoryAccessList = std::forward_list<MemoryAccess *>;
-
   ScopStmt(const ScopStmt &) = delete;
   const ScopStmt &operator=(const ScopStmt &) = delete;
 
@@ -830,6 +833,9 @@
   /// @brief Return true if this statement represents a whole region.
   bool isRegionStmt() const { return R != nullptr; }
 
+  /// @brief Return true if this statement does not contain any accesses.
+  bool isEmpty() const { return MemAccs.empty(); }
+
   /// @brief Return the (scalar) memory accesses for @p Inst.
   const MemoryAccessList &getAccessesFor(const Instruction *Inst) const {
     MemoryAccessList *MAL = lookupAccessesFor(Inst);
@@ -863,6 +869,12 @@
     BB = Block;
   }
 
+  /// @brief Move the memory access @p MA from this statement to @p TargetList.
+  ///
+  /// Note that scalar accesses that are caused by @p MA will be eliminated in
+  /// the whole SCoP.
+  void hoistMemoryAccess(MemoryAccess &MA, MemoryAccessList &TargetList);
+
   typedef MemoryAccessVec::iterator iterator;
   typedef MemoryAccessVec::const_iterator const_iterator;
 
@@ -973,7 +985,7 @@
   /// Max loop depth.
   unsigned MaxLoopDepth;
 
-  typedef std::deque<ScopStmt> StmtSet;
+  typedef std::list<ScopStmt> StmtSet;
   /// The statements in this Scop.
   StmtSet Stmts;
 
@@ -1080,6 +1092,9 @@
   /// group to ensure the SCoP is executed in an alias free environment.
   MinMaxVectorPairVectorTy MinMaxAliasGroups;
 
+  /// @brief List of invariant accesses.
+  MemoryAccessList InvariantAccesses;
+
   /// @brief Scop constructor; invoked from ScopInfo::buildScop.
   Scop(Region &R, AccFuncMapType &AccFuncMap, ScalarEvolution &SE,
        DominatorTree &DT, isl_ctx *ctx, unsigned MaxLoopDepth);
@@ -1133,6 +1148,16 @@
   /// @brief Add parameter constraints to @p C that imply a non-empty domain.
   __isl_give isl_set *addNonEmptyDomainConstraints(__isl_take isl_set *C) const;
 
+  /// @brief Simplify the SCoP, e.g., remove statements that do not contain
+  ///        any accesses (anymore).
+  void simplifySCoP();
+
+  /// @brief Mark all invariant memory loads and check for required ones.
+  ///
+  /// During SCoP detection loads can be tagged as "required invariant". In
+  /// such a case we need to verify here that we actually tagged them as
+  /// invariant, otherwise the SCoP is not valid and has to be dropped.
+  void collectAndCheckInvariantLoads(ScopDetection &SD);
+
   /// @brief Build the Context of the Scop.
   void buildContext();
 
@@ -1263,6 +1288,11 @@
   /// @return The maximum depth of the loop.
   inline unsigned getMaxLoopDepth() const { return MaxLoopDepth; }
 
+  /// @brief Return the set of invariant accesses.
+  const MemoryAccessList &getInvariantAccesses() const {
+    return InvariantAccesses;
+  }
+
   /// @brief Mark the SCoP as optimized by the scheduler.
   void markAsOptimized() { IsOptimized = true; }
 
@@ -1522,8 +1552,10 @@
   /// @param L The parent loop of the instruction
   /// @param R The region on which to build the data access dictionary.
   /// @param BoxedLoops The set of loops that are overapproximated in @p R.
+  /// @param InvariantLoads The set of invariant loads for this SCoP.
   void buildMemoryAccess(Instruction *Inst, Loop *L, Region *R,
-                         const ScopDetection::BoxedLoopsSetTy *BoxedLoops);
+                         const ScopDetection::BoxedLoopsSetTy *BoxedLoops,
+                         const SmallPtrSetImpl<LoadInst *> &InvariantLoads);
 
   /// @brief Analyze and extract the cross-BB scalar dependences (or,
   ///        dataflow dependencies) of an instruction.
Index: include/polly/Support/SCEVValidator.h
===================================================================
--- include/polly/Support/SCEVValidator.h
+++ include/polly/Support/SCEVValidator.h
@@ -21,6 +21,7 @@
 class ScalarEvolution;
 class Value;
 class Loop;
+class LoadInst;
 }
 
 namespace polly {
@@ -44,9 +45,10 @@
 /// @param S The SCEV to analyze.
 /// @param R The region in which we look for dependences.
 bool hasScalarDepsInsideRegion(const llvm::SCEV *S, const llvm::Region *R);
-bool isAffineExpr(const llvm::Region *R, const llvm::SCEV *Expression,
-                  llvm::ScalarEvolution &SE,
-                  const llvm::Value *BaseAddress = 0);
+bool isAffineExpr(
+    const llvm::Region *R, const llvm::SCEV *Expression,
+    llvm::ScalarEvolution &SE, const llvm::Value *BaseAddress = 0,
+    llvm::SmallPtrSetImpl<llvm::LoadInst *> *RequiredInvariantLoads = nullptr);
 std::vector<const llvm::SCEV *>
 getParamsInAffineExpr(const llvm::Region *R, const llvm::SCEV *Expression,
                       llvm::ScalarEvolution &SE,
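The extended isAffineExpr entry point stays backward compatible: without the optional set it behaves as before, with it the validator may classify loads from inside the region as parameters. A usage sketch, assuming a region R, an expression Expr, and ScalarEvolution SE are in scope:

    llvm::SmallPtrSet<llvm::LoadInst *, 4> RIL;
    // Old behavior: any load inside R makes Expr non-affine.
    bool StrictlyAffine = polly::isAffineExpr(&R, Expr, SE);
    // New behavior: Expr may be affine provided the loads collected in RIL
    // stay invariant; the caller still has to verify that.
    bool AffineIfInvariant = polly::isAffineExpr(&R, Expr, SE, nullptr, &RIL);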
Index: include/polly/Support/ScopHelper.h
===================================================================
--- include/polly/Support/ScopHelper.h
+++ include/polly/Support/ScopHelper.h
@@ -14,9 +14,13 @@
 #ifndef POLLY_SUPPORT_IRHELPER_H
 #define POLLY_SUPPORT_IRHELPER_H
 
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+
 namespace llvm {
 class Type;
 class Instruction;
+class LoadInst;
 class LoopInfo;
 class Loop;
 class ScalarEvolution;
@@ -113,5 +117,18 @@
 ///
 /// @return True if the block is a error block, false otherwise.
 bool isErrorBlock(llvm::BasicBlock &BB);
+
+/// @brief Check if the given loads could be invariant and can be hoisted.
+///
+/// @param Loads The loads to check.
+/// @param R     The analyzed region.
+/// @param AA    The alias analysis.
+/// @param LI    The loop info.
+/// @param SE    The scalar evolution analysis.
+///
+/// @return True if all loads can be assumed invariant.
+bool onlyHoistableInvariantLoads(llvm::SmallPtrSetImpl<llvm::LoadInst *> &Loads,
+                                 llvm::Region &R, llvm::AliasAnalysis &AA,
+                                 llvm::LoopInfo &LI, llvm::ScalarEvolution &SE);
 }
 
 #endif
Index: lib/Analysis/ScopDetection.cpp
===================================================================
--- lib/Analysis/ScopDetection.cpp
+++ lib/Analysis/ScopDetection.cpp
@@ -255,9 +255,10 @@
   if (Verify) {
     BoxedLoopsSetTy DummyBoxedLoopsSet;
     NonAffineSubRegionSetTy DummyNonAffineSubRegionSet;
+    SmallPtrSet<LoadInst *, 4> DummyRequiredInvariantLoads;
    DetectionContext Context(const_cast<Region &>(R), *AA,
                             DummyNonAffineSubRegionSet, DummyBoxedLoopsSet,
-                             false /*verifying*/);
+                             DummyRequiredInvariantLoads, false /*verifying*/);
     return isValidRegion(Context);
   }
 
@@ -299,6 +300,34 @@
   return (AllowNonAffineSubLoops || Context.BoxedLoopsSet.empty());
 }
 
+bool ScopDetection::onlyValidRequiredInvariantLoads(
+    SmallPtrSetImpl<LoadInst *> &RequiredInvariantLoads,
+    DetectionContext &Context) const {
+
+  if (!onlyHoistableInvariantLoads(RequiredInvariantLoads, Context.CurRegion,
+                                   *AA, *LI, *SE))
+    return false;
+
+  Context.RequiredInvariantLoads.insert(RequiredInvariantLoads.begin(),
+                                        RequiredInvariantLoads.end());
+
+  return true;
+}
+
+bool ScopDetection::isAffine(const SCEV *S, DetectionContext &Context,
+                             Value *BaseAddress) const {
+
+  SmallPtrSet<LoadInst *, 4> RequiredInvariantLoads;
+  if (!isAffineExpr(&Context.CurRegion, S, *SE, BaseAddress,
+                    &RequiredInvariantLoads))
+    return false;
+
+  if (!onlyValidRequiredInvariantLoads(RequiredInvariantLoads, Context))
+    return false;
+
+  return true;
+}
+
 bool ScopDetection::isValidCFG(BasicBlock &BB,
                                DetectionContext &Context) const {
   Region &CurRegion = Context.CurRegion;
@@ -314,6 +343,13 @@
   if (!Br)
     return invalid(Context, /*Assert=*/true, &BB);
 
+  Region *R = RI->getRegionFor(&BB);
+  while (CurRegion.contains(R)) {
+    if (Context.NonAffineSubRegionSet.count(R))
+      return true;
+    R = R->getParent();
+  }
+
   if (Br->isUnconditional())
     return true;
 
@@ -357,8 +393,7 @@
     const SCEV *LHS = SE->getSCEVAtScope(ICmp->getOperand(0), L);
     const SCEV *RHS = SE->getSCEVAtScope(ICmp->getOperand(1), L);
 
-    if (!isAffineExpr(&CurRegion, LHS, *SE) ||
-        !isAffineExpr(&CurRegion, RHS, *SE)) {
+    if (!isAffine(LHS, Context) || !isAffine(RHS, Context)) {
       if (!AllowNonAffineSubRegions ||
           !addOverApproximatedRegion(RI->getRegionFor(&BB), Context))
         return invalid(Context, /*Assert=*/true, &BB, LHS, RHS);
 
@@ -414,18 +449,6 @@
   if (!isInvariant(*Operand, Reg))
     return false;
 
-  // When the instruction is a load instruction, check that no write to memory
-  // in the region aliases with the load.
-  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    auto Loc = MemoryLocation::get(LI);
-
-    // Check if any basic block in the region can modify the location pointed to
-    // by 'Loc'. If so, 'Val' is (likely) not invariant in the region.
- for (const BasicBlock *BB : Reg.blocks()) - if (AA->canBasicBlockModify(*BB, Loc)) - return false; - } - return true; } @@ -509,7 +532,7 @@ const Instruction *Insn = Pair.first; const SCEV *AF = Pair.second; - if (!isAffineExpr(&CurRegion, AF, *SE, BaseValue)) { + if (!isAffine(AF, Context, BaseValue)) { invalid(Context, /*Assert=*/true, AF, Insn, BaseValue); if (!KeepGoing) @@ -536,7 +559,7 @@ MemAcc *Acc = &TempMemoryAccesses.find(Insn)->second; if (!AF) { - if (isAffineExpr(&CurRegion, Pair.second, *SE, BaseValue)) + if (isAffine(Pair.second, Context, BaseValue)) Acc->DelinearizedSubscripts.push_back(Pair.second); else IsNonAffine = true; @@ -546,7 +569,7 @@ if (Acc->DelinearizedSubscripts.size() == 0) IsNonAffine = true; for (const SCEV *S : Acc->DelinearizedSubscripts) - if (!isAffineExpr(&CurRegion, S, *SE, BaseValue)) + if (!isAffine(S, Context, BaseValue)) IsNonAffine = true; } @@ -617,11 +640,11 @@ if (PollyDelinearize && !isVariantInNonAffineLoop) { Context.Accesses[BasePointer].push_back({&Inst, AccessFunction}); - if (!isAffineExpr(&CurRegion, AccessFunction, *SE, BaseValue)) + if (!isAffine(AccessFunction, Context, BaseValue)) Context.NonAffineAccesses.insert(BasePointer); } else if (!AllowNonAffine) { if (isVariantInNonAffineLoop || - !isAffineExpr(&CurRegion, AccessFunction, *SE, BaseValue)) + !isAffine(AccessFunction, Context, BaseValue)) return invalid(Context, /*Assert=*/true, AccessFunction, &Inst, BaseValue); } @@ -778,7 +801,8 @@ while (ExpandedRegion) { DetectionContext Context( *ExpandedRegion, *AA, NonAffineSubRegionMap[ExpandedRegion.get()], - BoxedLoopsMap[ExpandedRegion.get()], false /* verifying */); + BoxedLoopsMap[ExpandedRegion.get()], + RequiredInvariantLoadsMap[ExpandedRegion.get()], false /* verifying */); DEBUG(dbgs() << "\t\tTrying " << ExpandedRegion->getNameStr() << "\n"); // Only expand when we did not collect errors. 
@@ -841,7 +865,7 @@
 void ScopDetection::findScops(Region &R) {
   DetectionContext Context(R, *AA, NonAffineSubRegionMap[&R], BoxedLoopsMap[&R],
-                           false /*verifying*/);
+                           RequiredInvariantLoadsMap[&R], false /*verifying*/);
 
   bool RegionIsValid = false;
   if (!DetectUnprofitable && regionWithoutLoops(R, LI))
@@ -1083,14 +1107,23 @@
   return &BLMIt->second;
 }
 
+const SmallPtrSetImpl<LoadInst *> *
+ScopDetection::getRequiredInvariantLoads(const Region *R) const {
+  auto RILMIt = RequiredInvariantLoadsMap.find(R);
+  if (RILMIt == RequiredInvariantLoadsMap.end())
+    return nullptr;
+  return &RILMIt->second;
+}
+
 void polly::ScopDetection::verifyRegion(const Region &R) const {
   assert(isMaxRegionInScop(R) && "Expect R is a valid region.");
 
   BoxedLoopsSetTy DummyBoxedLoopsSet;
   NonAffineSubRegionSetTy DummyNonAffineSubRegionSet;
+  SmallPtrSet<LoadInst *, 4> DummyRequiredInvariantLoads;
   DetectionContext Context(const_cast<Region &>(R), *AA,
                            DummyNonAffineSubRegionSet, DummyBoxedLoopsSet,
-                           true /*verifying*/);
+                           DummyRequiredInvariantLoads, true /*verifying*/);
   isValidRegion(Context);
 }
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -58,6 +58,7 @@
 #define DEBUG_TYPE "polly-scops"
 
 STATISTIC(ScopFound, "Number of valid Scops");
+STATISTIC(InvariantLoads, "Number of invariant loads");
 STATISTIC(RichScopFound, "Number of Scops containing a loop");
 
 static cl::opt<bool> ModelReadOnlyScalars(
@@ -1025,7 +1026,16 @@
     auto Expr = Subscripts[i + IndexOffset];
     auto Size = Sizes[i];
 
-    if (!isAffineExpr(&Parent.getRegion(), Expr, SE))
+    SmallPtrSet<LoadInst *, 4> RequiredInvariantLoads;
+    if (!isAffineExpr(&Parent.getRegion(), Expr, SE, nullptr,
+                      &RequiredInvariantLoads))
+      continue;
+
+    // TODO: This is conservative but we need to adjust the order in which
+    //       we look for invariant loads and create assumptions from GEPs
+    //       in order to check if all elements of RequiredInvariantLoads are
+    //       actually invariant and hoisted.
+    if (!RequiredInvariantLoads.empty())
       continue;
 
     isl_pw_aff *AccessOffset = getPwAff(Expr);
@@ -1282,6 +1292,40 @@
 
 void ScopStmt::dump() const { print(dbgs()); }
 
+void ScopStmt::hoistMemoryAccess(MemoryAccess &MA,
+                                 MemoryAccessList &TargetList) {
+  auto &MAL = *lookupAccessesFor(MA.getAccessInstruction());
+  MAL.reverse();
+
+  auto MALIt = MAL.begin();
+  auto MALEnd = MAL.end();
+  auto MemAccsIt = MemAccs.begin();
+  while (MALIt != MALEnd) {
+    while (*MemAccsIt != *MALIt)
+      MemAccsIt++;
+
+    MALIt++;
+    MemAccs.erase(MemAccsIt);
+  }
+
+  TargetList.push_front(&MA);
+  InstructionToAccess.erase(MA.getAccessInstruction());
+
+  // Encode the statement domain in the memory access function of MA as we
+  // might delete Stmt later on but need to access the domain again during
+  // code generation. Though, we only need the parameter constraints as the
+  // access is invariant in all surrounding loops anyway.
+  ScopStmt &Stmt = *MA.getStatement();
+  isl_map *AR = MA.getAccessRelation();
+  AR = isl_map_remove_dims(AR, isl_dim_in, 0, isl_map_n_in(AR));
+  AR = isl_map_reset_tuple_id(AR, isl_dim_in);
+  AR = isl_map_intersect_params(AR, isl_set_params(Stmt.getDomain()));
+  AR = isl_map_detect_equalities(AR);
+  MA.setNewAccessRelation(isl_map_coalesce(AR));
+
+  delete &MAL;
+}
+
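The access-relation rewrite at the end of hoistMemoryAccess is easiest to see on a concrete relation (illustrative isl notation, names made up):

    // Before hoisting, the access is anchored at a statement instance:
    //   [N] -> { Stmt_bb2[i] -> MemRef_B[0] : 0 <= i < N }
    // After dropping the input dimensions and keeping only the parameter
    // constraints of the domain, the relation survives statement deletion:
    //   [N] -> { MemRef_B[0] : N > 0 }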
 //===----------------------------------------------------------------------===//
 /// Scop class implement
 
@@ -2199,6 +2243,9 @@
   buildBoundaryContext();
   simplifyContexts();
   buildAliasChecks(AA);
+
+  collectAndCheckInvariantLoads(SD);
+  simplifySCoP();
 }
 
 Scop::~Scop() {
@@ -2229,6 +2276,100 @@
     Access->updateDimensionality();
 }
 
+void Scop::simplifySCoP() {
+
+  for (auto StmtIt = Stmts.begin(), StmtEnd = Stmts.end(); StmtIt != StmtEnd;) {
+    ScopStmt &Stmt = *StmtIt;
+
+    if (!StmtIt->isEmpty()) {
+      StmtIt++;
+      continue;
+    }
+
+    if (Stmt.isRegionStmt())
+      for (BasicBlock *BB : Stmt.getRegion()->blocks())
+        StmtMap.erase(BB);
+    else
+      StmtMap.erase(Stmt.getBasicBlock());
+
+    StmtIt = Stmts.erase(StmtIt);
+  }
+}
+
+void Scop::collectAndCheckInvariantLoads(ScopDetection &SD) {
+  isl_union_map *Writes = getWrites();
+  for (ScopStmt &Stmt : *this) {
+
+    // TODO: Loads that are not loop carried, hence are in a statement with
+    //       zero iterators, are by construction invariant, though we
+    //       currently "hoist" them anyway. This is necessary because we allow
+    //       them to be treated as parameters (e.g., in conditions) and our
+    //       code generation would otherwise use the old value.
+
+    isl_set *Domain = Stmt.getDomain();
+    SmallVector<MemoryAccess *, 8> InvMAs;
+
+    for (MemoryAccess *MA : Stmt) {
+      if (MA->isImplicit() || MA->isWrite() || !MA->isAffine())
+        continue;
+
+      isl_map *AccessRelation = MA->getAccessRelation();
+      if (isl_map_involves_dims(AccessRelation, isl_dim_in, 0,
+                                Stmt.getNumIterators())) {
+        isl_map_free(AccessRelation);
+        continue;
+      }
+
+      AccessRelation =
+          isl_map_intersect_domain(AccessRelation, isl_set_copy(Domain));
+      isl_set *AccessRange = isl_map_range(AccessRelation);
+
+      isl_union_map *Written = isl_union_map_intersect_range(
+          isl_union_map_copy(Writes), isl_union_set_from_set(AccessRange));
+      bool IsWritten = !isl_union_map_is_empty(Written);
+      isl_union_map_free(Written);
+
+      if (IsWritten)
+        continue;
+
+      InvMAs.push_back(MA);
+    }
+
+    InvariantLoads += InvMAs.size();
+
+    // Transfer the memory accesses from the statement to the SCoP.
+    for (MemoryAccess *MA : InvMAs)
+      Stmt.hoistMemoryAccess(*MA, InvariantAccesses);
+
+    isl_set_free(Domain);
+  }
+  isl_union_map_free(Writes);
+
+  if (!InvariantAccesses.empty())
+    IsOptimized = true;
+
+  // We always inserted invariant accesses at the front but need them in a
+  // "natural order": the statements are already sorted in reverse post order
+  // and that order suffices for the accesses too. The reason we need an order
+  // in the first place is the dependences between invariant loads, which can
+  // be caused by indirect loads but also by conditionally executed ones where
+  // the condition is itself an invariant load.
+  InvariantAccesses.reverse();
+
+  // Check required invariant loads that were tagged during SCoP detection.
+  for (LoadInst *LI : *SD.getRequiredInvariantLoads(&getRegion())) {
+    ScopStmt *Stmt = getStmtForBasicBlock(LI->getParent());
+    if (Stmt->lookupAccessesFor(LI) != nullptr) {
+      DEBUG(errs() << "\n\nWARNING: Load (" << *LI
+                   << ") is required to be invariant but was not marked as "
+                      "such. SCoP for " << getRegion()
+                   << " will be dropped\n\n");
+      addAssumption(isl_set_empty(getParamSpace()));
+      return;
+    }
+  }
+}
+
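The ordering requirement becomes visible as soon as one invariant load feeds another. A source-level illustration (hypothetical input):

    // Both loads are invariant, but the preload of idx has to be emitted
    // before the dependent, indirect preload of P[idx]:
    int idx = *B;     // invariant load
    int val = P[idx]; // invariant load whose address depends on idx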
 const ScopArrayInfo *
 Scop::getOrCreateScopArrayInfo(Value *BasePtr, Type *AccessType,
                                ArrayRef<const SCEV *> Sizes, bool IsPHI) {
@@ -2409,6 +2550,10 @@
      << "\n";
   OS.indent(4) << "Region: " << getNameStr() << "\n";
   OS.indent(4) << "Max Loop Depth: " << getMaxLoopDepth() << "\n";
+  OS.indent(4) << "Invariant Accesses: {\n";
+  for (const MemoryAccess *MA : InvariantAccesses)
+    MA->print(OS);
+  OS.indent(4) << "}\n";
   printContext(OS.indent(4));
   printArrayInfo(OS.indent(4));
   printAliasAssumptions(OS);
@@ -2848,7 +2993,8 @@
 
 void ScopInfo::buildMemoryAccess(
     Instruction *Inst, Loop *L, Region *R,
-    const ScopDetection::BoxedLoopsSetTy *BoxedLoops) {
+    const ScopDetection::BoxedLoopsSetTy *BoxedLoops,
+    const SmallPtrSetImpl<LoadInst *> &InvariantLoads) {
   unsigned Size;
   Type *SizeType;
   Value *Val;
@@ -2895,11 +3041,18 @@
       std::vector<const SCEV *> SizesSCEV;
 
       bool AllAffineSubcripts = true;
-      for (auto Subscript : Subscripts)
-        if (!isAffineExpr(R, Subscript, *SE)) {
-          AllAffineSubcripts = false;
+      for (auto Subscript : Subscripts) {
+        SmallPtrSet<LoadInst *, 4> RequiredInvariantLoads;
+        AllAffineSubcripts =
+            isAffineExpr(R, Subscript, *SE, nullptr, &RequiredInvariantLoads);
+
+        for (auto *LInst : RequiredInvariantLoads)
+          if (!InvariantLoads.count(LInst))
+            AllAffineSubcripts = false;
+
+        if (!AllAffineSubcripts)
           break;
-        }
+      }
 
       if (AllAffineSubcripts && Sizes.size() > 0) {
         for (auto V : Sizes)
@@ -2933,8 +3086,14 @@
       isVariantInNonAffineLoop = true;
   }
 
+  SmallPtrSet<LoadInst *, 4> RequiredInvariantLoads;
   bool IsAffine = !isVariantInNonAffineLoop &&
-                  isAffineExpr(R, AccessFunction, *SE, BasePointer->getValue());
+                  isAffineExpr(R, AccessFunction, *SE, BasePointer->getValue(),
+                               &RequiredInvariantLoads);
+
+  for (auto *LInst : RequiredInvariantLoads)
+    if (!InvariantLoads.count(LInst))
+      IsAffine = false;
 
   // FIXME: Size is the number of bytes of an array element, not the number of
   // elements as probably intended here.
@@ -2971,6 +3130,9 @@
   // The set of loops contained in non-affine subregions that are part of R.
   const ScopDetection::BoxedLoopsSetTy *BoxedLoops = SD->getBoxedLoops(&R);
 
+  // The set of loads that are required to be invariant.
+  auto &RequiredInvariantLoads = *SD->getRequiredInvariantLoads(&R);
+
   for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) {
     Instruction *Inst = I;
 
@@ -2982,12 +3144,21 @@
     if (!PHI && IsExitBlock)
       break;
 
+    // TODO: At this point we only know that RequiredInvariantLoads have to be
+    //       invariant and hoisted for the SCoP to be processed. Though, there
+    //       might be other invariant accesses that will be hoisted and that
+    //       would allow us to make a non-affine access affine.
     if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
-      buildMemoryAccess(Inst, L, &R, BoxedLoops);
+      buildMemoryAccess(Inst, L, &R, BoxedLoops, RequiredInvariantLoads);
 
     if (isIgnoredIntrinsic(Inst))
       continue;
 
+    // Do not build scalar dependences for required invariant loads as we will
+    // hoist them later on anyway or drop the SCoP if we cannot.
+    if (RequiredInvariantLoads.count(dyn_cast<LoadInst>(Inst)))
+      continue;
+
     if (buildScalarDependences(Inst, &R, NonAffineSubRegion)) {
       if (!isa<StoreInst>(Inst))
         addScalarWriteAccess(Inst);
Index: lib/CodeGen/BlockGenerators.cpp
===================================================================
--- lib/CodeGen/BlockGenerators.cpp
+++ lib/CodeGen/BlockGenerators.cpp
@@ -107,6 +107,8 @@
     return const_cast<Value *>(Old);
 
   if (Value *New = GlobalMap.lookup(Old)) {
+    if (Value *NewRemapped = GlobalMap.lookup(New))
+      New = NewRemapped;
     if (Old->getType()->getScalarSizeInBits() <
         New->getType()->getScalarSizeInBits())
       New = Builder.CreateTruncOrBitCast(New, Old->getType());
@@ -209,6 +211,9 @@
 Value *BlockGenerator::generateScalarLoad(ScopStmt &Stmt, const LoadInst *Load,
                                           ValueMapT &BBMap, LoopToScevMapT &LTS,
                                           isl_id_to_ast_expr *NewAccesses) {
+  if (Value *PreloadLoad = GlobalMap.lookup(Load))
+    return PreloadLoad;
+
   const Value *Pointer = Load->getPointerOperand();
   Value *NewPointer =
       generateLocationAccessed(Stmt, Load, Pointer, BBMap, LTS, NewAccesses);
@@ -688,9 +693,7 @@
 Value *VectorBlockGenerator::generateUnknownStrideLoad(
     ScopStmt &Stmt, const LoadInst *Load, VectorValueMapT &ScalarMaps,
-    __isl_keep isl_id_to_ast_expr *NewAccesses
-
-    ) {
+    __isl_keep isl_id_to_ast_expr *NewAccesses) {
   int VectorWidth = getVectorWidth();
   const Value *Pointer = Load->getPointerOperand();
   VectorType *VectorType = VectorType::get(
@@ -713,6 +716,12 @@
 void VectorBlockGenerator::generateLoad(
     ScopStmt &Stmt, const LoadInst *Load, ValueMapT &VectorMap,
     VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
+  if (Value *PreloadLoad = GlobalMap.lookup(Load)) {
+    VectorMap[Load] = Builder.CreateVectorSplat(getVectorWidth(), PreloadLoad,
+                                                Load->getName() + "_p");
+    return;
+  }
+
   if (!VectorType::isValidElementType(Load->getType())) {
     for (int i = 0; i < getVectorWidth(); i++)
       ScalarMaps[i][Load] =
Index: lib/CodeGen/CodeGeneration.cpp
===================================================================
--- lib/CodeGen/CodeGeneration.cpp
+++ lib/CodeGen/CodeGeneration.cpp
@@ -145,8 +145,9 @@
     auto SplitBlock = StartBlock->getSinglePredecessor();
     Builder.SetInsertPoint(SplitBlock->getTerminator());
     NodeBuilder.addParameters(S.getContext());
+    NodeBuilder.preloadInvariantLoads();
     Value *RTC = buildRTC(Builder, NodeBuilder.getExprBuilder());
-    SplitBlock->getTerminator()->setOperand(0, RTC);
+    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
     Builder.SetInsertPoint(StartBlock->begin());
     NodeBuilder.create(AstRoot);
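The new double lookup in getNewValue accounts for chained mappings: preloading maps an original value to its preload, and that preload may itself have been remapped by a later code-generation step. A sketch of the map state this guards against (names hypothetical):

    // GlobalMap may hold a two-step chain after preloading:
    //   GlobalMap[OrigVal]    = PreloadVal;  // installed by preloading
    //   GlobalMap[PreloadVal] = RemappedVal; // installed by a later step
    // A single lookup would return PreloadVal; the extra hop yields
    // RemappedVal, the value actually visible at the point of use.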
Index: lib/CodeGen/IslNodeBuilder.cpp
===================================================================
--- lib/CodeGen/IslNodeBuilder.cpp
+++ lib/CodeGen/IslNodeBuilder.cpp
@@ -178,6 +178,7 @@
   SetVector<Value *> &Values;
   SetVector<const SCEV *> &SCEVs;
   BlockGenerator &BlockGen;
+  ValueMapT &ValueMap;
 };
 
 /// @brief Extract the values and SCEVs needed to generate code for a block.
@@ -220,7 +221,8 @@
   if (Access->isExplicit()) {
     auto *BasePtr = Access->getScopArrayInfo()->getBasePtr();
     if (Instruction *OpInst = dyn_cast<Instruction>(BasePtr))
-      if (Stmt->getParent()->getRegion().contains(OpInst))
+      if (Stmt->getParent()->getRegion().contains(OpInst) &&
+          !References.ValueMap.count(OpInst))
         continue;
 
     References.Values.insert(BasePtr);
@@ -282,8 +284,8 @@
                                            SetVector<Loop *> &Loops) {
   SetVector<const SCEV *> SCEVs;
-  struct SubtreeReferences References = {LI, SE, S.getRegion(),
-                                         Values, SCEVs, getBlockGenerator()};
+  struct SubtreeReferences References = {
+      LI, SE, S.getRegion(), Values, SCEVs, getBlockGenerator(), ValueMap};
 
   for (const auto &I : IDToValue)
     Values.insert(I.second);
@@ -814,12 +816,129 @@
   llvm_unreachable("Unknown isl_ast_node type");
 }
 
+/// @brief Create the actual preload memory access for @p MA.
+static inline Value *createPreloadLoad(Scop &S, const MemoryAccess &MA,
+                                       isl_ast_build *Build,
+                                       IslExprBuilder &ExprBuilder) {
+  isl_set *AccessRange = isl_map_range(MA.getAccessRelation());
+  isl_pw_multi_aff *PWAccRel = isl_pw_multi_aff_from_set(AccessRange);
+  PWAccRel = isl_pw_multi_aff_gist_params(PWAccRel, S.getContext());
+  isl_ast_expr *Access =
+      isl_ast_build_access_from_pw_multi_aff(Build, PWAccRel);
+  return ExprBuilder.create(Access);
+}
+
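preloadInvariantLoad below emits the guarded form of a preload whenever the runtime context does not already imply the execution condition. The control flow it creates looks as follows (block names as set by the code, values illustrative):

    // polly.preload.cond:  br i1 %cond, label %exec, label %merge
    // polly.preload.exec:  %val = <preloaded access>; br label %merge
    // polly.preload.merge: %merged = phi [ %val, %exec ], [ undef, %cond ]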
+Value *IslNodeBuilder::preloadInvariantLoad(const MemoryAccess &MA,
+                                            isl_ast_build *Build,
+                                            isl_set *RuntimeContext) {
+
+  isl_set *Domain = isl_map_params(MA.getAccessRelation());
+
+  if (isl_set_is_subset(RuntimeContext, Domain)) {
+    isl_set_free(Domain);
+    return createPreloadLoad(S, MA, Build, ExprBuilder);
+  } else {
+
+    isl_ast_expr *DomainCond = isl_ast_build_expr_from_set(Build, Domain);
+
+    Value *Cond = ExprBuilder.create(DomainCond);
+    if (!Cond->getType()->isIntegerTy(1))
+      Cond = Builder.CreateIsNotNull(Cond);
+
+    BasicBlock *CondBB = SplitBlock(Builder.GetInsertBlock(),
+                                    Builder.GetInsertPoint(), &DT, &LI);
+    CondBB->setName("polly.preload.cond");
+
+    BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), &DT, &LI);
+    MergeBB->setName("polly.preload.merge");
+
+    Function *F = Builder.GetInsertBlock()->getParent();
+    LLVMContext &Context = F->getContext();
+    BasicBlock *ExecBB = BasicBlock::Create(Context, "polly.preload.exec", F);
+
+    DT.addNewBlock(ExecBB, CondBB);
+    if (Loop *L = LI.getLoopFor(CondBB))
+      L->addBasicBlockToLoop(ExecBB, LI);
+
+    auto *CondBBTerminator = CondBB->getTerminator();
+    Builder.SetInsertPoint(CondBBTerminator);
+    Builder.CreateCondBr(Cond, ExecBB, MergeBB);
+    CondBBTerminator->eraseFromParent();
+
+    Builder.SetInsertPoint(ExecBB);
+    Builder.CreateBr(MergeBB);
+
+    Builder.SetInsertPoint(ExecBB->getTerminator());
+    Instruction *AccInst = MA.getAccessInstruction();
+    Type *AccInstTy = AccInst->getType();
+    Value *PreAccInst = createPreloadLoad(S, MA, Build, ExprBuilder);
+
+    Builder.SetInsertPoint(MergeBB->getTerminator());
+    auto *MergePHI = Builder.CreatePHI(
+        AccInstTy, 2, "polly.preload." + AccInst->getName() + ".merge");
+    MergePHI->addIncoming(PreAccInst, ExecBB);
+    MergePHI->addIncoming(UndefValue::get(AccInstTy), CondBB);
+
+    return MergePHI;
+  }
+}
+
+void IslNodeBuilder::preloadInvariantLoads() {
+
+  const auto &InvAccList = S.getInvariantAccesses();
+  if (InvAccList.empty())
+    return;
+
+  const Region &R = S.getRegion();
+
+  BasicBlock *PreLoadBB =
+      SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI);
+  PreLoadBB->setName("polly.preload.begin");
+  Builder.SetInsertPoint(PreLoadBB->begin());
+
+  isl_ast_build *Build =
+      isl_ast_build_from_context(isl_set_universe(S.getParamSpace()));
+  isl_set *RuntimeContext = S.getRuntimeCheckContext();
+
+  for (auto *MA : InvAccList) {
+    if (MA->isImplicit())
+      continue;
+
+    Instruction *AccInst = MA->getAccessInstruction();
+    Value *PreloadVal = preloadInvariantLoad(*MA, Build, RuntimeContext);
+    ValueMap[AccInst] = PreloadVal;
+
+    if (SE.isSCEVable(AccInst->getType())) {
+      isl_id *ParamId = S.getIdForParam(SE.getSCEV(AccInst));
+      if (ParamId)
+        IDToValue[ParamId] = PreloadVal;
+      isl_id_free(ParamId);
+    }
+
+    SmallVector<Instruction *, 8> Users;
+    for (auto *U : AccInst->users())
+      if (Instruction *UI = dyn_cast<Instruction>(U))
+        if (!R.contains(UI))
+          Users.push_back(UI);
+    for (auto *U : Users)
+      U->replaceUsesOfWith(AccInst, PreloadVal);
+
+    auto *SAI = S.getScopArrayInfo(MA->getBaseAddr());
+    for (auto *DerivedSAI : SAI->getDerivedSAIs())
+      DerivedSAI->setBasePtr(PreloadVal);
+  }
+
+  isl_set_free(RuntimeContext);
+  isl_ast_build_free(Build);
+}
+
 void IslNodeBuilder::addParameters(__isl_take isl_set *Context) {
 
   for (unsigned i = 0; i < isl_set_dim(Context, isl_dim_param); ++i) {
     isl_id *Id;
 
     Id = isl_set_get_dim_id(Context, isl_dim_param, i);
+    IDToValue[Id] = generateSCEV((const SCEV *)isl_id_get_user(Id));
 
     isl_id_free(Id);
Index: lib/Support/SCEVValidator.cpp
===================================================================
--- lib/Support/SCEVValidator.cpp
+++ lib/Support/SCEVValidator.cpp
@@ -125,10 +125,13 @@
   const Region *R;
   ScalarEvolution &SE;
   const Value *BaseAddress;
+  SmallPtrSetImpl<LoadInst *> *RequiredInvariantLoads;
 
 public:
-  SCEVValidator(const Region *R, ScalarEvolution &SE, const Value *BaseAddress)
-      : R(R), SE(SE), BaseAddress(BaseAddress) {}
+  SCEVValidator(const Region *R, ScalarEvolution &SE, const Value *BaseAddress,
+                SmallPtrSetImpl<LoadInst *> *RequiredInvariantLoads)
+      : R(R), SE(SE), BaseAddress(BaseAddress),
+        RequiredInvariantLoads(RequiredInvariantLoads) {}
 
   class ValidatorResult visitConstant(const SCEVConstant *Constant) {
     return ValidatorResult(SCEVType::INT);
   }
@@ -335,6 +338,15 @@
     return ValidatorResult(SCEVType::PARAM, S);
   }
 
+  ValidatorResult visitLoadInstruction(Instruction *I, const SCEV *S) {
+    if (R->contains(I) && RequiredInvariantLoads) {
+      RequiredInvariantLoads->insert(cast<LoadInst>(I));
+      return ValidatorResult(SCEVType::PARAM, S);
+    }
+
+    return visitGenericInst(I, S);
+  }
+
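With this hook a SCEVUnknown wrapping a region-internal load no longer invalidates the whole expression. For example (illustrative):

    // For a subscript such as (sext i32 %tmp to i64) with
    //   %tmp = load i32, i32* %B   ; executed inside the region
    // the validator used to return INVALID because %tmp is a region-internal
    // instruction; it now returns PARAM and records %tmp in
    // RequiredInvariantLoads for the later hoistability check.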
   ValidatorResult visitSDivInstruction(Instruction *SDiv, const SCEV *S) {
     assert(SDiv->getOpcode() == Instruction::SDiv &&
            "Assumed SDiv instruction!");
@@ -391,6 +403,8 @@
     if (Instruction *I = dyn_cast<Instruction>(Expr->getValue())) {
       switch (I->getOpcode()) {
+      case Instruction::Load:
+        return visitLoadInstruction(I, Expr);
       case Instruction::SDiv:
         return visitSDivInstruction(I, Expr);
       case Instruction::SRem:
@@ -550,11 +564,12 @@
 }
 
 bool isAffineExpr(const Region *R, const SCEV *Expr, ScalarEvolution &SE,
-                  const Value *BaseAddress) {
+                  const Value *BaseAddress,
+                  SmallPtrSetImpl<LoadInst *> *RequiredInvariantLoads) {
   if (isa<SCEVCouldNotCompute>(Expr))
     return false;
 
-  SCEVValidator Validator(R, SE, BaseAddress);
+  SCEVValidator Validator(R, SE, BaseAddress, RequiredInvariantLoads);
   DEBUG({
     dbgs() << "\n";
     dbgs() << "Expr: " << *Expr << "\n";
@@ -580,7 +595,8 @@
   if (isa<SCEVCouldNotCompute>(Expr))
     return std::vector<const SCEV *>();
 
-  SCEVValidator Validator(R, SE, BaseAddress);
+  SmallPtrSet<LoadInst *, 4> RequiredInvariantLoads;
+  SCEVValidator Validator(R, SE, BaseAddress, &RequiredInvariantLoads);
   ValidatorResult Result = Validator.visit(Expr);
   assert(Result.isValid() && "Requested parameters for an invalid SCEV!");
Index: lib/Support/ScopHelper.cpp
===================================================================
--- lib/Support/ScopHelper.cpp
+++ lib/Support/ScopHelper.cpp
@@ -13,7 +13,6 @@
 
 #include "polly/Support/ScopHelper.h"
 #include "polly/ScopInfo.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -345,3 +344,29 @@
 
   return false;
 }
+
+bool polly::onlyHoistableInvariantLoads(SmallPtrSetImpl<LoadInst *> &Loads,
+                                        Region &R, AliasAnalysis &AA,
+                                        LoopInfo &LI, ScalarEvolution &SE) {
+
+  for (LoadInst *LInst : Loads) {
+
+    Loop *L = LI.getLoopFor(LInst->getParent());
+    const SCEV *PtrSCEV = SE.getSCEVAtScope(LInst->getPointerOperand(), L);
+    while (L && R.contains(L)) {
+      if (!SE.isLoopInvariant(PtrSCEV, L))
+        return false;
+      L = L->getParentLoop();
+    }
+
+    auto Loc = MemoryLocation::get(LInst);
+
+    // Check if any basic block in the region can modify the location pointed
+    // to by 'Loc'. If so, the load is (likely) not invariant in the region.
+    for (const BasicBlock *BB : R.blocks())
+      if (AA.canBasicBlockModify(*BB, Loc))
+        return false;
+  }
+
+  return true;
+}
Index: test/Isl/CodeGen/aliasing_parametric_simple_2.ll
===================================================================
--- test/Isl/CodeGen/aliasing_parametric_simple_2.ll
+++ test/Isl/CodeGen/aliasing_parametric_simple_2.ll
@@ -6,6 +6,7 @@
 ; }
 ;
 ; CHECK: sext i32 %c to i64
+; CHECK: sext i32 %c to i64
 ; CHECK: %[[M0:[._a-zA-Z0-9]*]] = sext i32 %c to i64
 ; CHECK: %[[M1:[._a-zA-Z0-9]*]] = icmp sle i64 %[[M0]], 15
 ; CHECK: %[[M2:[._a-zA-Z0-9]*]] = sext i32 %c to i64
@@ -23,7 +24,7 @@
 ; CHECK: %[[BMin:[._a-zA-Z0-9]*]] = getelementptr i32, i32* %B, i64 %[[m4]]
 ; CHECK: %[[AltB:[._a-zA-Z0-9]*]] = icmp ule i32* %[[AMax]], %[[BMin]]
 ; CHECK: %[[NoAlias:[._a-zA-Z0-9]*]] = or i1 %[[BltA]], %[[AltB]]
-; CHECK: %[[RTC:[._a-zA-Z0-9]*]] = and i1 %1, %[[NoAlias]]
+; CHECK: %[[RTC:[._a-zA-Z0-9]*]] = and i1 %3, %[[NoAlias]]
 ; CHECK: br i1 %[[RTC]], label %polly.start, label %for.cond
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: test/Isl/CodeGen/exprModDiv.ll
===================================================================
--- test/Isl/CodeGen/exprModDiv.ll
+++ test/Isl/CodeGen/exprModDiv.ll
@@ -6,7 +6,7 @@
 ;
 ;    void exprModDiv(float *A, float *B, float *C, long N, long p) {
 ;      for (long i = 0; i < N; i++)
-;        C[i] += A[i] + B[i] + A[p] + B[p];
+;        C[i] += A[i] + B[i] + A[i] + B[i + p];
 ;    }
 ;
 ;
@@ -32,21 +32,21 @@
 ;    #define floord(n,d) ((n < 0) ?
(n - d + 1) : n) / d ; A[p + 127 * floord(-p - 1, 127) + 127] -; CHECK: %20 = sub nsw i64 0, %p -; CHECK: %21 = sub nsw i64 %20, 1 -; CHECK: %pexp.fdiv_q.0 = sub i64 %21, 127 +; CHECK: %17 = sub nsw i64 0, %p +; CHECK: %18 = sub nsw i64 %17, 1 +; CHECK: %pexp.fdiv_q.0 = sub i64 %18, 127 ; CHECK: %pexp.fdiv_q.1 = add i64 %pexp.fdiv_q.0, 1 -; CHECK: %pexp.fdiv_q.2 = icmp slt i64 %21, 0 -; CHECK: %pexp.fdiv_q.3 = select i1 %pexp.fdiv_q.2, i64 %pexp.fdiv_q.1, i64 %21 +; CHECK: %pexp.fdiv_q.2 = icmp slt i64 %18, 0 +; CHECK: %pexp.fdiv_q.3 = select i1 %pexp.fdiv_q.2, i64 %pexp.fdiv_q.1, i64 %18 ; CHECK: %pexp.fdiv_q.4 = sdiv i64 %pexp.fdiv_q.3, 127 -; CHECK: %22 = mul nsw i64 127, %pexp.fdiv_q.4 -; CHECK: %23 = add nsw i64 %p, %22 -; CHECK: %24 = add nsw i64 %23, 127 -; CHECK: %polly.access.A10 = getelementptr float, float* %A, i64 %24 +; CHECK: %19 = mul nsw i64 127, %pexp.fdiv_q.4 +; CHECK: %20 = add nsw i64 %p, %19 +; CHECK: %21 = add nsw i64 %20, 127 +; CHECK: %polly.access.A10 = getelementptr float, float* %A, i64 %21 ; A[p / 127] ; CHECK: %pexp.div = sdiv exact i64 %p, 127 -; CHECK: %polly.access.B12 = getelementptr float, float* %B, i64 %pexp.div +; CHECK: %polly.access.B13 = getelementptr float, float* %B, i64 %pexp.div ; A[i % 128] ; POW2: %pexp.pdiv_r = urem i64 %polly.indvar, 128 @@ -58,17 +58,17 @@ ; #define floord(n,d) ((n < 0) ? (n - d + 1) : n) / d ; A[p + 128 * floord(-p - 1, 128) + 128] -; POW2: %20 = sub nsw i64 0, %p -; POW2: %21 = sub nsw i64 %20, 1 -; POW2: %polly.fdiv_q.shr = ashr i64 %21, 7 -; POW2: %22 = mul nsw i64 128, %polly.fdiv_q.shr -; POW2: %23 = add nsw i64 %p, %22 -; POW2: %24 = add nsw i64 %23, 128 -; POW2: %polly.access.A10 = getelementptr float, float* %A, i64 %24 +; POW2: %17 = sub nsw i64 0, %p +; POW2: %18 = sub nsw i64 %17, 1 +; POW2: %polly.fdiv_q.shr = ashr i64 %18, 7 +; POW2: %19 = mul nsw i64 128, %polly.fdiv_q.shr +; POW2: %20 = add nsw i64 %p, %19 +; POW2: %21 = add nsw i64 %20, 128 +; POW2: %polly.access.A10 = getelementptr float, float* %A, i64 %21 ; A[p / 128] ; POW2: %pexp.div = sdiv exact i64 %p, 128 -; POW2: %polly.access.B12 = getelementptr float, float* %B, i64 %pexp.div +; POW2: %polly.access.B13 = getelementptr float, float* %B, i64 %pexp.div target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -87,10 +87,11 @@ %arrayidx1 = getelementptr inbounds float, float* %B, i64 %i.0 %tmp1 = load float, float* %arrayidx1, align 4 %add = fadd float %tmp, %tmp1 - %arrayidx2 = getelementptr inbounds float, float* %A, i64 %p + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %i.0 %tmp2 = load float, float* %arrayidx2, align 4 %add3 = fadd float %add, %tmp2 - %arrayidx4 = getelementptr inbounds float, float* %B, i64 %p + %padd = add nsw i64 %p, %i.0 + %arrayidx4 = getelementptr inbounds float, float* %B, i64 %padd %tmp3 = load float, float* %arrayidx4, align 4 %add5 = fadd float %add3, %tmp3 %arrayidx6 = getelementptr inbounds float, float* %C, i64 %i.0 Index: test/Isl/CodeGen/invariant_load.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-detect-unprofitable -polly-no-early-exit -polly-codegen -S < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK-NEXT: %polly.access.B = getelementptr i32, i32* %B, i64 0 +; CHECK-NEXT: %polly.access.B.load = load i32, i32* %polly.access.B +; +; CHECK-LABEL: polly.stmt.bb2: +; CHECK-NEXT: %scevgep = getelementptr i32, i32* %A, i64 %polly.indvar +; CHECK-NEXT: store 
i32 %polly.access.B.load, i32* %scevgep, align 4 +; +; void f(int *restrict A, int *restrict B) { +; for (int i = 0; i < 1024; i++) +; A[i] = *B; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* noalias %A, i32* noalias %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb5 + +bb2: ; preds = %bb1 + %tmp = load i32, i32* %B, align 4 + %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp, i32* %tmp3, align 4 + br label %bb4 + +bb4: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb5: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_base_pointer.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_base_pointer.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK-NEXT: %polly.access.BPLoc = getelementptr i32*, i32** %BPLoc, i64 0 +; CHECK-NEXT: %polly.access.BPLoc.load = load i32*, i32** %polly.access.BPLoc +; +; CHECK-LABEL: polly.stmt.bb2: +; CHECK-NEXT: %p_tmp3 = getelementptr inbounds i32, i32* %polly.access.BPLoc.load, i64 %polly.indvar +; +; void f(int **BPLoc) { +; for (int i = 0; i < 1024; i++) +; (*BPLoc)[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32** %BPLoc) { +bb: + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb5 + +bb2: ; preds = %bb1 + %tmp = load i32*, i32** %BPLoc, align 8 + %tmp3 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv + store i32 0, i32* %tmp3, align 4 + br label %bb4 + +bb4: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb5: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_base_pointer_conditional.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_base_pointer_conditional.ll @@ -0,0 +1,61 @@ +; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK-NEXT: %0 = sext i32 %N to i64 +; CHECK-NEXT: %1 = icmp sge i64 %0, 514 +; CHECK-NEXT: br label %polly.preload.cond +; +; CHECK-LABEL: polly.preload.cond: +; CHECK-NEXT: br i1 %1, label %polly.preload.exec, label %polly.preload.merge +; +; CHECK-LABEL: polly.preload.merge: +; CHECK-NEXT: %polly.preload.tmp6.merge = phi i32* [ %polly.access.BPLoc.load, %polly.preload.exec ], [ undef, %polly.preload.cond ] +; +; CHECK-LABEL: polly.stmt.bb5: +; CHECK-NEXT: %p_tmp7 = getelementptr inbounds i32, i32* %polly.preload.tmp6.merge, i64 %polly.indvar6 +; +; void f(int **BPLoc, int *A, int N) { +; for (int i = 0; i < N; i++) +; if (i > 512) +; (*BPLoc)[i] = 0; +; else +; A[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32** %BPLoc, i32* %A, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + br label %bb1 + +bb1: ; preds = %bb11, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb11 ], [ 0, %bb ] + %tmp2 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp2, label %bb3, 
label %bb12 + +bb3: ; preds = %bb1 + %tmp4 = icmp sgt i64 %indvars.iv, 512 + br i1 %tmp4, label %bb5, label %bb8 + +bb5: ; preds = %bb3 + %tmp6 = load i32*, i32** %BPLoc, align 8 + %tmp7 = getelementptr inbounds i32, i32* %tmp6, i64 %indvars.iv + store i32 0, i32* %tmp7, align 4 + br label %bb10 + +bb8: ; preds = %bb3 + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8, %bb5 + br label %bb11 + +bb11: ; preds = %bb10 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb12: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_condition.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_condition.ll @@ -0,0 +1,47 @@ +; RUN: opt %loadPolly -polly-detect-unprofitable -polly-codegen -S < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK-NEXT: %polly.access.C = getelementptr i32, i32* %C, i64 0 +; CHECK-NEXT: %polly.access.C.load = load i32, i32* %polly.access.C +; CHECK-NOT: %polly.access.C.load = load i32, i32* %polly.access.C +; +; CHECK-LABEL: polly.stmt.bb2: +; CHECK-NEXT: icmp eq i32 %polly.access.C.load, 0 + +; void f(int *A, int *C) { +; for (int i = 0; i < 1024; i++) +; if (*C) +; A[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32* %C) { +bb: + br label %bb1 + +bb1: ; preds = %bb7, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp = load i32, i32* %C, align 4 + %tmp3 = icmp eq i32 %tmp, 0 + br i1 %tmp3, label %bb6, label %bb4 + +bb4: ; preds = %bb2 + %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 0, i32* %tmp5, align 4 + br label %bb6 + +bb6: ; preds = %bb2, %bb4 + br label %bb7 + +bb7: ; preds = %bb6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_loop_ub.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_loop_ub.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-codegen -polly-detect-unprofitable -S < %s | FileCheck %s +; +; TODO: This is a negative test as we cannot handle loops with a +; trip count ScalarEvolution cannot handle. However, once +; this changes we should be able to detect this. 
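+;       (The bound *UB is re-loaded in the loop header, so the exit
+;       condition depends on a region-internal load; until that load is
+;       hoisted, ScalarEvolution cannot compute a trip count for the loop.)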
+; +; CHECK-NOT: polly.start +; +; void f(int *A, int *UB) { +; for (int i = 0; i < *UB; i++) +; A[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32* %UB) { +bb: + br label %bb1 + +bb1: ; preds = %bb6, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb6 ], [ 0, %bb ] + %tmp = load i32, i32* %UB, align 4 + %tmp2 = sext i32 %tmp to i64 + %tmp3 = icmp slt i64 %indvars.iv, %tmp2 + br i1 %tmp3, label %bb4, label %bb7 + +bb4: ; preds = %bb1 + %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 0, i32* %tmp5, align 4 + br label %bb6 + +bb6: ; preds = %bb4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_ptr_ptr_noalias.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_ptr_ptr_noalias.ll @@ -0,0 +1,44 @@ +; RUN: opt %loadPolly -polly-detect-unprofitable -polly-codegen -polly-ignore-aliasing -S -polly-no-early-exit < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK: %polly.access.A = getelementptr i32**, i32*** %A, i64 42 +; CHECK: %polly.access.A.load = load i32**, i32*** %polly.access.A +; CHECK: %polly.access.polly.access.A.load = getelementptr i32*, i32** %polly.access.A.load, i64 32 +; CHECK: %polly.access.polly.access.A.load.load = load i32*, i32** %polly.access.polly.access.A.load +; +; CHECK: polly.stmt.bb2: +; CHECK: %p_tmp6 = getelementptr inbounds i32, i32* %polly.access.polly.access.A.load.load, i64 %polly.indvar +; CHECK: store i32 0, i32* %p_tmp6, align 4 +; +; void f(int ***A) { +; for (int i = 0; i < 1024; i++) +; A[42][32][i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32*** %A) { +bb: + br label %bb1 + +bb1: ; preds = %bb7, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i32**, i32*** %A, i64 42 + %tmp3 = load i32**, i32*** %tmp, align 8 + %tmp4 = getelementptr inbounds i32*, i32** %tmp3, i64 32 + %tmp5 = load i32*, i32** %tmp4, align 8 + %tmp6 = getelementptr inbounds i32, i32* %tmp5, i64 %indvars.iv + store i32 0, i32* %tmp6, align 4 + br label %bb7 + +bb7: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb8: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/invariant_load_scalar_dep.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/invariant_load_scalar_dep.ll @@ -0,0 +1,44 @@ +; RUN: opt %loadPolly -polly-no-early-exit -polly-codegen -polly-ignore-aliasing -polly-detect-unprofitable -S < %s | FileCheck %s +; +; CHECK-LABEL: polly.preload.begin: +; CHECK: %polly.access.B = getelementptr i32, i32* %B, i64 0 +; CHECK: %polly.access.B.load = load i32, i32* %polly.access.B +; +; CHECK-LABEL: polly.stmt.bb2.split: +; CHECK: %scevgep = getelementptr i32, i32* %A, i64 %polly.indvar +; CHECK: store i32 %polly.access.B.load, i32* %scevgep, align 4 +; +; void f(int *restrict A, int *restrict B) { +; for (int i = 0; i < 1024; i++) +; auto tmp = *B; +; // Split BB +; A[i] = tmp; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* noalias %A, i32* noalias %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %exitcond = icmp ne 
i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb5 + +bb2: ; preds = %bb1 + %tmp = load i32, i32* %B, align 4 + br label %bb2.split + +bb2.split: + %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp, i32* %tmp3, align 4 + br label %bb4 + +bb4: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb5: ; preds = %bb1 + ret void +} Index: test/Isl/CodeGen/non-affine-phi-node-expansion.ll =================================================================== --- test/Isl/CodeGen/non-affine-phi-node-expansion.ll +++ test/Isl/CodeGen/non-affine-phi-node-expansion.ll @@ -4,6 +4,11 @@ %struct.wombat = type {[4 x i32]} +; CHECK: polly.preload.begin: +; CHECK-NEXT: %polly.access.B = getelementptr i32, i32* %B, i64 0 +; CHECK-NEXT: %polly.access.B.load = load i32, i32* %polly.access.B +; CHECK-NOT: %polly.access.B.load = load i32, i32* %polly.access.B + ; CHECK: polly.stmt.bb3.entry: ; preds = %polly.start ; CHECK: br label %polly.stmt.bb3 @@ -16,8 +21,7 @@ ; CHECK: br label %polly.stmt.bb13.exit ; CHECK: polly.stmt.bb5: ; preds = %polly.stmt.bb3 -; CHECK: %tmp7_p_scalar_ = load i32, i32* %B, !alias.scope !0, !noalias !2 -; CHECK: store i32 %tmp7_p_scalar_, i32* %polly.access.cast.arg1, !alias.scope !3, !noalias !4 +; CHECK: store i32 %polly.access.B.load, i32* %polly.access.cast.arg2 ; CHECK: br label %polly.stmt.bb13.exit ; Function Attrs: nounwind uwtable Index: test/Isl/CodeGen/phi_in_exit_early_lnt_failure_4.ll =================================================================== --- test/Isl/CodeGen/phi_in_exit_early_lnt_failure_4.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -disable-basicaa -polly-detect-unprofitable -polly-codegen -polly-no-early-exit -S < %s | FileCheck %s -; -; This caused an lnt crash at some point, just verify it will run through and -; produce the PHI node in the exit we are looking for. 
-; -; CHECK-LABEL: polly.merge_new_and_old: -; CHECK-NEXT: %.merge = phi %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826* [ %.final_reload, %polly.stmt.for.end.298 ], [ %13, %for.end.298 ] -; -%struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i8**, i32, i32***, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], i32****, i32***, i32***, i32***, i32****, i32****, %struct.Picture.8.32.56.80.104.320.536.752.1016.1040.1184.1232.1352.1376.1400.1424.1496.1568.1664.1736.1832.2048.2120.2336.2384.2840.2864.2888.2912.3584.3800.3823*, %struct.Slice.7.31.55.79.103.319.535.751.1015.1039.1183.1231.1351.1375.1399.1423.1495.1567.1663.1735.1831.2047.2119.2335.2383.2839.2863.2887.2911.3583.3799.3822*, %struct.macroblock.9.33.57.81.105.321.537.753.1017.1041.1185.1233.1353.1377.1401.1425.1497.1569.1665.1737.1833.2049.2121.2337.2385.2841.2865.2889.2913.3585.3801.3824*, i32*, i32*, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i16******, i16******, i16******, i16******, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [32 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s.10.34.58.82.106.322.538.754.1018.1042.1186.1234.1354.1378.1402.1426.1498.1570.1666.1738.1834.2050.2122.2338.2386.2842.2866.2890.2914.3586.3802.3825*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double**, double***, i32***, double**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [2 x i32], i32, i32, i16, i32, i32, i32, i32, i32 } -%struct.Picture.8.32.56.80.104.320.536.752.1016.1040.1184.1232.1352.1376.1400.1424.1496.1568.1664.1736.1832.2048.2120.2336.2384.2840.2864.2888.2912.3584.3800.3823 = type { i32, i32, [100 x %struct.Slice.7.31.55.79.103.319.535.751.1015.1039.1183.1231.1351.1375.1399.1423.1495.1567.1663.1735.1831.2047.2119.2335.2383.2839.2863.2887.2911.3583.3799.3822*], i32, float, float, float } -%struct.Slice.7.31.55.79.103.319.535.751.1015.1039.1183.1231.1351.1375.1399.1423.1495.1567.1663.1735.1831.2047.2119.2335.2383.2839.2863.2887.2911.3583.3799.3822 = type { i32, i32, i32, i32, i32, i32, %struct.datapartition.3.27.51.75.99.315.531.747.1011.1035.1179.1227.1347.1371.1395.1419.1491.1563.1659.1731.1827.2043.2115.2331.2379.2835.2859.2883.2907.3579.3795.3818*, %struct.MotionInfoContexts.5.29.53.77.101.317.533.749.1013.1037.1181.1229.1349.1373.1397.1421.1493.1565.1661.1733.1829.2045.2117.2333.2381.2837.2861.2885.2909.3581.3797.3820*, %struct.TextureInfoContexts.6.30.54.78.102.318.534.750.1014.1038.1182.1230.1350.1374.1398.1422.1494.1566.1662.1734.1830.2046.2118.2334.2382.2838.2862.2886.2910.3582.3798.3821*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (i32)*, [3 x [2 x i32]] } 
-%struct.datapartition.3.27.51.75.99.315.531.747.1011.1035.1179.1227.1347.1371.1395.1419.1491.1563.1659.1731.1827.2043.2115.2331.2379.2835.2859.2883.2907.3579.3795.3818 = type { %struct.Bitstream.1.25.49.73.97.313.529.745.1009.1033.1177.1225.1345.1369.1393.1417.1489.1561.1657.1729.1825.2041.2113.2329.2377.2833.2857.2881.2905.3577.3793.3816*, %struct.EncodingEnvironment.2.26.50.74.98.314.530.746.1010.1034.1178.1226.1346.1370.1394.1418.1490.1562.1658.1730.1826.2042.2114.2330.2378.2834.2858.2882.2906.3578.3794.3817, %struct.EncodingEnvironment.2.26.50.74.98.314.530.746.1010.1034.1178.1226.1346.1370.1394.1418.1490.1562.1658.1730.1826.2042.2114.2330.2378.2834.2858.2882.2906.3578.3794.3817 } -%struct.Bitstream.1.25.49.73.97.313.529.745.1009.1033.1177.1225.1345.1369.1393.1417.1489.1561.1657.1729.1825.2041.2113.2329.2377.2833.2857.2881.2905.3577.3793.3816 = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 } -%struct.EncodingEnvironment.2.26.50.74.98.314.530.746.1010.1034.1178.1226.1346.1370.1394.1418.1490.1562.1658.1730.1826.2042.2114.2330.2378.2834.2858.2882.2906.3578.3794.3817 = type { i32, i32, i32, i32, i32, i8*, i32*, i32, i32 } -%struct.MotionInfoContexts.5.29.53.77.101.317.533.749.1013.1037.1181.1229.1349.1373.1397.1421.1493.1565.1661.1733.1829.2045.2117.2333.2381.2837.2861.2885.2909.3581.3797.3820 = type { [3 x [11 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [2 x [9 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [2 x [10 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [2 x [6 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [4 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819], [4 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819], [3 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819] } -%struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819 = type { i16, i8, i64 } -%struct.TextureInfoContexts.6.30.54.78.102.318.534.750.1014.1038.1182.1230.1350.1374.1398.1422.1494.1566.1662.1734.1830.2046.2118.2334.2382.2838.2862.2886.2910.3582.3798.3821 = type { [2 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819], [4 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819], [3 x [4 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [4 x 
%struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [15 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [15 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [5 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [5 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [15 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]], [10 x [15 x %struct.BiContextType.4.28.52.76.100.316.532.748.1012.1036.1180.1228.1348.1372.1396.1420.1492.1564.1660.1732.1828.2044.2116.2332.2380.2836.2860.2884.2908.3580.3796.3819]] } -%struct.macroblock.9.33.57.81.105.321.537.753.1017.1041.1185.1233.1353.1377.1401.1425.1497.1569.1665.1737.1833.2049.2121.2337.2385.2841.2865.2889.2913.3585.3801.3824 = type { i32, i32, i32, [2 x i32], i32, [8 x i32], %struct.macroblock.9.33.57.81.105.321.537.753.1017.1041.1185.1233.1353.1377.1401.1425.1497.1569.1665.1737.1833.2049.2121.2337.2385.2841.2865.2889.2913.3585.3801.3824*, %struct.macroblock.9.33.57.81.105.321.537.753.1017.1041.1185.1233.1353.1377.1401.1425.1497.1569.1665.1737.1833.2049.2121.2337.2385.2841.2865.2889.2913.3585.3801.3824*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 } -%struct.DecRefPicMarking_s.10.34.58.82.106.322.538.754.1018.1042.1186.1234.1354.1378.1402.1426.1498.1570.1666.1738.1834.2050.2122.2338.2386.2842.2866.2890.2914.3586.3802.3825 = type { i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s.10.34.58.82.106.322.538.754.1018.1042.1186.1234.1354.1378.1402.1426.1498.1570.1666.1738.1834.2050.2122.2338.2386.2842.2866.2890.2914.3586.3802.3825* } - -@img = external global %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826*, align 8 - -; Function Attrs: nounwind uwtable -define void @intrapred_luma() #0 { -entry: - %PredPel = alloca [13 x i16], align 16 - br label %for.body - -for.body: ; preds = %for.body, %entry - br i1 undef, label %for.body, label %for.body.262 - -for.body.262: ; preds = %for.body - %0 = load %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826*, %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826** @img, align 8 - br label %for.body.280 - -for.body.280: ; preds = %for.body.280, %for.body.262 - %indvars.iv66 = phi i64 [ 0, %for.body.262 ], [ %indvars.iv.next67, %for.body.280 ] - %arrayidx282 = getelementptr inbounds [13 x i16], [13 x i16]* %PredPel, i64 0, 
i64 1 - %arrayidx283 = getelementptr inbounds i16, i16* %arrayidx282, i64 %indvars.iv66 - %1 = load i16, i16* %arrayidx283, align 2 - %arrayidx289 = getelementptr inbounds %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826, %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826* %0, i64 0, i32 47, i64 0, i64 2, i64 %indvars.iv66 - store i16 %1, i16* %arrayidx289, align 2 - %indvars.iv.next67 = add nuw nsw i64 %indvars.iv66, 1 - br i1 false, label %for.body.280, label %for.end.298 - -for.end.298: ; preds = %for.body.280 - %2 = load %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826*, %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826** @img, align 8 - br label %for.body.310 - -for.body.310: ; preds = %for.body.310, %for.end.298 - %indvars.iv = phi i64 [ 0, %for.end.298 ], [ %indvars.iv.next, %for.body.310 ] - %arrayidx312 = getelementptr inbounds [13 x i16], [13 x i16]* %PredPel, i64 0, i64 9 - %arrayidx313 = getelementptr inbounds i16, i16* %arrayidx312, i64 %indvars.iv - %3 = load i16, i16* %arrayidx313, align 2 - %arrayidx322 = getelementptr inbounds %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826, %struct.ImageParameters.11.35.59.83.107.323.539.755.1019.1043.1187.1235.1355.1379.1403.1427.1499.1571.1667.1739.1835.2051.2123.2339.2387.2843.2867.2891.2915.3587.3803.3826* %2, i64 0, i32 47, i64 1, i64 %indvars.iv, i64 1 - store i16 %3, i16* %arrayidx322, align 2 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br i1 false, label %for.body.310, label %for.end.328 - -for.end.328: ; preds = %for.body.310 - ret void -} Index: test/Isl/CodeGen/simple_vec_call.ll =================================================================== --- test/Isl/CodeGen/simple_vec_call.ll +++ test/Isl/CodeGen/simple_vec_call.ll @@ -24,16 +24,10 @@ ret void } -; CHECK: %value_p_splat_one = load <1 x float>, <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*), align 8 -; CHECK: %value_p_splat = shufflevector <1 x float> %value_p_splat_one, <1 x float> %value_p_splat_one, <4 x i32> zeroinitializer -; CHECK: %0 = extractelement <4 x float> %value_p_splat, i32 0 -; CHECK: %1 = extractelement <4 x float> %value_p_splat, i32 1 -; CHECK: %2 = extractelement <4 x float> %value_p_splat, i32 2 -; CHECK: %3 = extractelement <4 x float> %value_p_splat, i32 3 -; CHECK: [[RES1:%[a-zA-Z0-9_]+]] = tail call float @foo(float %0) [[NUW:#[0-9]+]] -; CHECK: [[RES2:%[a-zA-Z0-9_]+]] = tail call float @foo(float %1) [[NUW]] -; CHECK: [[RES3:%[a-zA-Z0-9_]+]] = tail call float @foo(float %2) [[NUW]] -; CHECK: [[RES4:%[a-zA-Z0-9_]+]] = tail call float @foo(float %3) [[NUW]] +; CHECK: [[RES1:%[a-zA-Z0-9_]+]] = tail call float @foo(float %.load) [[NUW:#[0-9]+]] +; CHECK: [[RES2:%[a-zA-Z0-9_]+]] = tail call float @foo(float %.load) [[NUW]] +; CHECK: [[RES3:%[a-zA-Z0-9_]+]] = tail call float @foo(float %.load) [[NUW]] +; CHECK: [[RES4:%[a-zA-Z0-9_]+]] = tail call float @foo(float %.load) [[NUW]] ; CHECK: %4 = insertelement <4 x float> undef, 
float [[RES1]], i32 0 ; CHECK: %5 = insertelement <4 x float> %4, float [[RES2]], i32 1 ; CHECK: %6 = insertelement <4 x float> %5, float [[RES3]], i32 2 Index: test/Isl/CodeGen/simple_vec_call_2.ll =================================================================== --- test/Isl/CodeGen/simple_vec_call_2.ll +++ test/Isl/CodeGen/simple_vec_call_2.ll @@ -24,19 +24,13 @@ ret void } -; CHECK: %value_p_splat_one = load <1 x float>, <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*), align 8 -; CHECK: %value_p_splat = shufflevector <1 x float> %value_p_splat_one, <1 x float> %value_p_splat_one, <4 x i32> zeroinitializer -; CHECK: %0 = extractelement <4 x float> %value_p_splat, i32 0 -; CHECK: %1 = extractelement <4 x float> %value_p_splat, i32 1 -; CHECK: %2 = extractelement <4 x float> %value_p_splat, i32 2 -; CHECK: %3 = extractelement <4 x float> %value_p_splat, i32 3 -; CHECK: [[RES1:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %0) [[NUW:#[0-9]+]] -; CHECK: [[RES2:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %1) [[NUW]] -; CHECK: [[RES3:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %2) [[NUW]] -; CHECK: [[RES4:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %3) [[NUW]] -; CHECK: %4 = insertelement <4 x float**> undef, float** %p_result, i32 0 -; CHECK: %5 = insertelement <4 x float**> %4, float** %p_result1, i32 1 -; CHECK: %6 = insertelement <4 x float**> %5, float** %p_result2, i32 2 -; CHECK: %7 = insertelement <4 x float**> %6, float** %p_result3, i32 3 -; CHECK: store <4 x float**> %7, <4 x float**>* bitcast ([1024 x float**]* @B to <4 x float**>*), align +; CHECK: [[RES1:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %.load) [[NUW:#[0-9]+]] +; CHECK: [[RES2:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %.load) [[NUW]] +; CHECK: [[RES3:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %.load) [[NUW]] +; CHECK: [[RES4:%[a-zA-Z0-9_]+]] = tail call float** @foo(float %.load) [[NUW]] +; CHECK: %0 = insertelement <4 x float**> undef, float** %p_result, i32 0 +; CHECK: %1 = insertelement <4 x float**> %0, float** %p_result1, i32 1 +; CHECK: %2 = insertelement <4 x float**> %1, float** %p_result2, i32 2 +; CHECK: %3 = insertelement <4 x float**> %2, float** %p_result3, i32 3 +; CHECK: store <4 x float**> %3, <4 x float**>* bitcast ([1024 x float**]* @B to <4 x float**>*), align ; CHECK: attributes [[NUW]] = { nounwind } Index: test/Isl/CodeGen/simple_vec_cast.ll =================================================================== --- test/Isl/CodeGen/simple_vec_cast.ll +++ test/Isl/CodeGen/simple_vec_cast.ll @@ -28,8 +28,10 @@ ret void } -; CHECK: %tmp_p_splat_one = load <1 x float>, <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*), align 8, !alias.scope !0, !noalias !2 -; CHECK: %tmp_p_splat = shufflevector <1 x float> %tmp_p_splat_one, <1 x float> %tmp_p_splat_one, <4 x i32> zeroinitializer -; CHECK: %0 = fpext <4 x float> %tmp_p_splat to <4 x double> -; CHECK: store <4 x double> %0, <4 x double>* bitcast ([1024 x double]* @B to <4 x double>*), align 8, !alias.scope !3, !noalias !4 +; CHECK: %.load = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i32 0, i32 0) +; CHECK: polly.stmt.bb2: ; preds = %polly.start +; CHECK: %tmp_p.splatinsert = insertelement <4 x float> undef, float %.load, i32 0 +; CHECK: %tmp_p.splat = shufflevector <4 x float> %tmp_p.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer +; CHECK: %0 = fpext <4 x float> %tmp_p.splat to <4 x double> +; CHECK: store <4 x double> %0, <4 x double>* Index: 
test/Isl/CodeGen/simple_vec_const.ll =================================================================== --- test/Isl/CodeGen/simple_vec_const.ll +++ test/Isl/CodeGen/simple_vec_const.ll @@ -52,5 +52,8 @@ } -; CHECK: load <1 x float>, <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*) -; CHECK: shufflevector <1 x float> {{.*}}, <1 x float> {{.*}} <4 x i32> zeroinitializer +; CHECK: %.load = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i32 0, i32 0) + +; CHECK: polly.stmt.: ; preds = %polly.start +; CHECK: %_p.splatinsert = insertelement <4 x float> undef, float %.load, i32 0 +; CHECK: %_p.splat = shufflevector <4 x float> %_p.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer Index: test/Isl/CodeGen/simple_vec_ptr_ptr_ty.ll =================================================================== --- test/Isl/CodeGen/simple_vec_ptr_ptr_ty.ll +++ test/Isl/CodeGen/simple_vec_ptr_ptr_ty.ll @@ -22,6 +22,9 @@ return: ret void } -; CHECK: %value_p_splat_one = load <1 x float**>, <1 x float**>* bitcast ([1024 x float**]* @A to <1 x float**>*), align 8 -; CHECK: %value_p_splat = shufflevector <1 x float**> %value_p_splat_one, <1 x float**> %value_p_splat_one, <4 x i32> zeroinitializer -; CHECK: store <4 x float**> %value_p_splat, <4 x float**>* bitcast ([1024 x float**]* @B to <4 x float**>*), align 8 +; CHECK: %.load = load float**, float*** getelementptr inbounds ([1024 x float**], [1024 x float**]* @A, i32 0, i32 0) + +; CHECK-NOT: load <1 x float**> +; CHECK: %value_p.splatinsert = insertelement <4 x float**> undef, float** %.load, i32 0 +; CHECK: %value_p.splat = shufflevector <4 x float**> %value_p.splatinsert, <4 x float**> undef, <4 x i32> zeroinitializer +; CHECK: store <4 x float**> %value_p.splat, <4 x float**>* bitcast ([1024 x float**]* @B to <4 x float**>*), align 8 Index: test/Isl/CodeGen/two-scops-in-row.ll =================================================================== --- test/Isl/CodeGen/two-scops-in-row.ll +++ test/Isl/CodeGen/two-scops-in-row.ll @@ -21,6 +21,7 @@ for.0: %Scalar0.val = load i32, i32* %Scalar0 + store i32 1, i32* %Scalar0 br i1 false, label %for.0, label %for.1.preheader for.1.preheader: Index: test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll =================================================================== --- test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll +++ test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll @@ -6,7 +6,7 @@ ; ; void a(struct b *A) { ; for (int i=0; i<32; i++) -; A->b[i] = 0; +; A[i].b[i] = 0; ; } ; CHECK: remark: ReportVariantBasePtr01.c:6:8: The following errors keep this region from being a Scop. 
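+;
+; Note (editorial, inferred from the IR change below): the getelementptr that
+; computes the base pointer now depends on the induction variable (A[i].b
+; instead of A->b), so the base pointer stays variant even when invariant
+; loads are hoisted and the remark checked above is still produced.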
@@ -23,11 +23,11 @@ entry.split: ; preds = %entry tail call void @llvm.dbg.value(metadata %struct.b* %A, i64 0, metadata !16, metadata !DIExpression()), !dbg !23 tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !25 - %b = getelementptr inbounds %struct.b, %struct.b* %A, i64 0, i32 0, !dbg !26 br label %for.body, !dbg !27 for.body: ; preds = %for.body, %entry.split %indvar4 = phi i64 [ %indvar.next, %for.body ], [ 0, %entry.split ] + %b = getelementptr inbounds %struct.b, %struct.b* %A, i64 %indvar4, i32 0, !dbg !26 %0 = mul i64 %indvar4, 4, !dbg !26 %1 = add i64 %0, 3, !dbg !26 %2 = add i64 %0, 2, !dbg !26 Index: test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll =================================================================== --- test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll +++ test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll @@ -31,8 +31,10 @@ ; void f(int * restrict A, int * restrict C) { ; int j; ; for (int i = 0; i < 1024; i++) { -; while ((j = C[i])) +; while ((j = C[i++])) { ; A[j]++; +; if (true) break; +; } ; } ; } ; @@ -62,7 +64,7 @@ %tmp9 = load i32, i32* %tmp8, align 4 %tmp10 = add nsw i32 %tmp9, 1 store i32 %tmp10, i32* %tmp8, align 4 - br label %bb3 + br i1 true, label %bb11, label %bb3 bb11: ; preds = %bb3 br label %bb12 Index: test/ScopInfo/inter_bb_scalar_dep.ll =================================================================== --- test/ScopInfo/inter_bb_scalar_dep.ll +++ test/ScopInfo/inter_bb_scalar_dep.ll @@ -14,6 +14,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" ; Function Attrs: nounwind +; CHECK: Invariant +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: MemRef_init_ptr[0] + define void @f(i64* noalias %A, i64 %N, i64* noalias %init_ptr) #0 { entry: br label %for.i @@ -25,11 +29,7 @@ entry.next: ; preds = %for.i %init = load i64, i64* %init_ptr -; CHECK-LABEL: Stmt_entry_next -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init[] }; +; CHECK-NOT: Stmt_entry_next br label %for.j for.j: ; preds = %for.j, %entry.next Index: test/ScopInfo/intra_and_inter_bb_scalar_dep.ll =================================================================== --- test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -14,7 +14,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -; Function Attrs: nounwind +; CHECK: Invariant Accesses: { +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: MemRef_init_ptr[0] +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: MemRef_init_ptr[0] +; CHECK: } define void @f(i64* noalias %A, i64 %N, i64* noalias %init_ptr) #0 { entry: br label %for.i @@ -26,23 +31,17 @@ entry.next: ; preds = %for.i %init = load i64, i64* %init_ptr -; CHECK-LABEL: Stmt_entry_next -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init[] }; +; CHECK-NOT: 
Stmt_entry_next br label %for.j for.j: ; preds = %for.j, %entry.next %indvar.j = phi i64 [ 0, %entry.next ], [ %indvar.j.next, %for.j ] %init_2 = load i64, i64* %init_ptr %init_sum = add i64 %init, %init_2 -; CHECK-LABEL: Stmt_for_j +; CHECK: Stmt_for_j ; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init_sum, i64* %scevgep Index: test/ScopInfo/intra_bb_scalar_dep.ll =================================================================== --- test/ScopInfo/intra_bb_scalar_dep.ll +++ test/ScopInfo/intra_bb_scalar_dep.ll @@ -14,6 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" ; Function Attrs: nounwind +; CHECK: Invariant Accesses: +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init_ptr[0] }; define void @f(i64* noalias %A, i64 %N, i64* noalias %init_ptr) #0 { entry: br label %for.i @@ -32,11 +35,12 @@ %init_plus_two = add i64 %init, 2 %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init_plus_two, i64* %scevgep -; CHECK-LABEL: Stmt_for_j -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK: Statements { +; CHECK-NEXT: Stmt_for_j +; CHECK-NOT: ReadAccess +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK-NEXT: } %indvar.j.next = add nsw i64 %indvar.j, 1 %exitcond.j = icmp eq i64 %indvar.j.next, %N br i1 %exitcond.j, label %for.i.end, label %for.j Index: test/ScopInfo/invariant_load.ll =================================================================== --- /dev/null +++ test/ScopInfo/invariant_load.ll @@ -0,0 +1,35 @@ +; RUN: opt %loadPolly -polly-detect-unprofitable -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Invariant Accesses: +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_B[0] }; +; +; void f(int *restrict A, int *restrict B) { +; for (int i = 0; i < 1024; i++) +; A[i] = *B; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* noalias %A, i32* noalias %B) { +bb: + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb5 + +bb2: ; preds = %bb1 + %tmp = load i32, i32* %B, align 4 + %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp, i32* %tmp3, align 4 + br label %bb4 + +bb4: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb5: ; preds = %bb1 + ret void +} Index: test/ScopInfo/invariant_load_base_pointer.ll =================================================================== --- /dev/null +++ test/ScopInfo/invariant_load_base_pointer.ll @@ -0,0 +1,35 @@ +; RUN: opt 
%loadPolly -polly-scops -polly-ignore-aliasing -polly-detect-unprofitable -analyze < %s | FileCheck %s +; +; CHECK: Invariant Accesses: +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_BPLoc[0] }; +; +; void f(int **BPLoc) { +; for (int i = 0; i < 1024; i++) +; (*BPLoc)[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32** %BPLoc) { +bb: + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb5 + +bb2: ; preds = %bb1 + %tmp = load i32*, i32** %BPLoc, align 8 + %tmp3 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv + store i32 0, i32* %tmp3, align 4 + br label %bb4 + +bb4: ; preds = %bb2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb5: ; preds = %bb1 + ret void +} Index: test/ScopInfo/invariant_load_base_pointer_conditional.ll =================================================================== --- /dev/null +++ test/ScopInfo/invariant_load_base_pointer_conditional.ll @@ -0,0 +1,51 @@ +; RUN: opt %loadPolly -polly-scops -polly-ignore-aliasing -polly-detect-unprofitable -analyze < %s | FileCheck %s +; +; CHECK: Invariant Accesses: +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: [N] -> { Stmt_bb5[i0] -> MemRef_BPLoc[0] }; +; +; void f(int **BPLoc, int *A, int N) { +; for (int i = 0; i < N; i++) +; if (i > 512) +; (*BPLoc)[i] = 0; +; else +; A[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32** %BPLoc, i32* %A, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + br label %bb1 + +bb1: ; preds = %bb11, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb11 ], [ 0, %bb ] + %tmp2 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp2, label %bb3, label %bb12 + +bb3: ; preds = %bb1 + %tmp4 = icmp sgt i64 %indvars.iv, 512 + br i1 %tmp4, label %bb5, label %bb8 + +bb5: ; preds = %bb3 + %tmp6 = load i32*, i32** %BPLoc, align 8 + %tmp7 = getelementptr inbounds i32, i32* %tmp6, i64 %indvars.iv + store i32 0, i32* %tmp7, align 4 + br label %bb10 + +bb8: ; preds = %bb3 + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8, %bb5 + br label %bb11 + +bb11: ; preds = %bb10 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb12: ; preds = %bb1 + ret void +} Index: test/ScopInfo/invariant_load_condition.ll =================================================================== --- /dev/null +++ test/ScopInfo/invariant_load_condition.ll @@ -0,0 +1,43 @@ +; RUN: opt %loadPolly -polly-detect-unprofitable -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Invariant Accesses: +; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: { Stmt_bb2__TO__bb6[i0] -> MemRef_C[0] }; +; +; void f(int *A, int *C) { +; for (int i = 0; i < 1024; i++) +; if (*C) +; A[i] = 0; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32* %C) { +bb: + br label %bb1 + +bb1: ; preds = %bb7, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb7 ], [ 0, %bb ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %bb2, label %bb8 + +bb2: ; preds = %bb1 + %tmp = load i32, i32* %C, align 4 + %tmp3 = icmp eq i32 %tmp, 0 + br i1 %tmp3, label %bb6, label %bb4 + +bb4: ; preds = %bb2 + %tmp5 = getelementptr inbounds i32, i32* %A, i64 
%indvars.iv
+  store i32 0, i32* %tmp5, align 4
+  br label %bb6
+
+bb6: ; preds = %bb2, %bb4
+  br label %bb7
+
+bb7: ; preds = %bb6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8: ; preds = %bb1
+  ret void
+}
Index: test/ScopInfo/invariant_load_loop_ub.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_loop_ub.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -polly-scops -polly-detect-unprofitable -analyze < %s | FileCheck %s
+;
+; TODO: This is a negative test: for now we bail out on loops whose trip
+;       count ScalarEvolution cannot compute. Once that changes we should
+;       be able to detect the load of the upper bound as invariant.
+;
+; CHECK-NOT: Context
+;
+; __CHECK: Invariant Accesses:
+; __CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; __CHECK-NEXT: { Stmt_bb1[i0] -> MemRef_UB[0] };
+;
+; void f(int *A, int *UB) {
+;   for (int i = 0; i < *UB; i++)
+;     A[i] = 0;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %UB) {
+bb:
+  br label %bb1
+
+bb1: ; preds = %bb6, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb6 ], [ 0, %bb ]
+  %tmp = load i32, i32* %UB, align 4
+  %tmp2 = sext i32 %tmp to i64
+  %tmp3 = icmp slt i64 %indvars.iv, %tmp2
+  br i1 %tmp3, label %bb4, label %bb7
+
+bb4: ; preds = %bb1
+  %tmp5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 0, i32* %tmp5, align 4
+  br label %bb6
+
+bb6: ; preds = %bb4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb7: ; preds = %bb1
+  ret void
+}
Index: test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
===================================================================
--- test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,6 +1,18 @@
 ; RUN: opt %loadPolly -tbaa -polly-scops -polly-ignore-aliasing \
 ; RUN:     -polly-detect-unprofitable -analyze < %s | FileCheck %s
 ;
+; Note: The order of the invariant accesses is important!
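+; The second load (from MemRef_tmp3) uses the result of the first one
+; (MemRef_A[42]) as its base pointer, as the BasePtrOrigin annotation below
+; shows, hence the preloads have to be generated in exactly this order.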
+;
+; CHECK: Invariant Accesses: {
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_A[42]
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_tmp3[32]
+; CHECK: }
+;
 ; CHECK: Arrays {
 ; CHECK: i32** MemRef_A[*][8]
 ; CHECK: i32* MemRef_tmp3[*][8] [BasePtrOrigin: MemRef_A]
@@ -31,11 +43,11 @@
 
 bb2: ; preds = %bb1
   %tmp = getelementptr inbounds i32**, i32*** %A, i64 42
-  %tmp3 = load i32**, i32*** %tmp, align 8, !tbaa !1
+  %tmp3 = load i32**, i32*** %tmp, align 8
   %tmp4 = getelementptr inbounds i32*, i32** %tmp3, i64 32
-  %tmp5 = load i32*, i32** %tmp4, align 8, !tbaa !1
+  %tmp5 = load i32*, i32** %tmp4, align 8
   %tmp6 = getelementptr inbounds i32, i32* %tmp5, i64 %indvars.iv
-  store i32 0, i32* %tmp6, align 4, !tbaa !5
+  store i32 0, i32* %tmp6, align 4
   br label %bb7
 
 bb7: ; preds = %bb2
@@ -45,11 +57,3 @@
 bb8: ; preds = %bb1
   ret void
 }
-
-!0 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git 9e282ff441e7a367dc711e41fd19d27ffc0f78d6)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"any pointer", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"int", !3, i64 0}
Index: test/ScopInfo/invariant_load_scalar_dep.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_load_scalar_dep.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -polly-detect-unprofitable -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses:
+; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK-NEXT: { Stmt_bb2[i0] -> MemRef_B[0] };
+; CHECK-NOT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1]
+; CHECK-NOT: { Stmt_bb2[i0] -> MemRef_tmp[] };
+;
+; void f(int *restrict A, int *restrict B) {
+;   for (int i = 0; i < 1024; i++) {
+;     int tmp = *B;
+;     // Split BB
+;     A[i] = tmp;
+;   }
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias %A, i32* noalias %B) {
+bb:
+  br label %bb1
+
+bb1: ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %exitcond = icmp ne i64 %indvars.iv, 1024
+  br i1 %exitcond, label %bb2, label %bb5
+
+bb2: ; preds = %bb1
+  %tmp = load i32, i32* %B, align 4
+  br label %bb2b
+
+bb2b: ; preds = %bb2
+  %tmp3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %tmp, i32* %tmp3, align 4
+  br label %bb4
+
+bb4: ; preds = %bb2b
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb5: ; preds = %bb1
+  ret void
+}
Index: test/ScopInfo/invariant_loop_bounds.ll
===================================================================
--- /dev/null
+++ test/ScopInfo/invariant_loop_bounds.ll
@@ -0,0 +1,105 @@
+; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s
+;
+; CHECK: Invariant Accesses: {
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_bounds[2] }
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_bounds[1] : tmp >= 1 }
+; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
+; CHECK: MemRef_bounds[0] : tmp8 >= 1 and tmp >= 1 }
+; CHECK: }
+;
+; CHECK: p0: %tmp
+; CHECK: p1: %tmp8
+; CHECK: p2: %tmp10
+; CHECK: Statements {
+; CHECK: Stmt_for_body_6
+; CHECK: Domain :=
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] : i0 >= 0 and i0 <= -1 + tmp and i1 >= 0 and i1 <= -1 + tmp8 and i2 >= 0 and i2 <= -1 + tmp10 };
+; CHECK: Schedule :=
+; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> [i0, i1, i2] };
+; CHECK: ReadAccess := [Reduction Type: NONE]
[Scalar: 0] +; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] }; +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: [tmp, tmp8, tmp10] -> { Stmt_for_body_6[i0, i1, i2] -> MemRef_data[i0, i1, i2] }; +; CHECK: } +; +; int bounds[3]; +; double data[1024][1024][1024]; +; +; void foo() { +; int i, j, k; +; for (k = 0; k < bounds[2]; k++) +; for (j = 0; j < bounds[1]; j++) +; for (i = 0; i < bounds[0]; i++) +; data[k][j][i] += i + j + k; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@bounds = common global [3 x i32] zeroinitializer, align 4 +@data = common global [1024 x [1024 x [1024 x double]]] zeroinitializer, align 16 + +define void @foo() { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc.16, %entry + %indvars.iv5 = phi i64 [ %indvars.iv.next6, %for.inc.16 ], [ 0, %entry ] + %tmp = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 2), align 4 + %tmp7 = sext i32 %tmp to i64 + %cmp = icmp slt i64 %indvars.iv5, %tmp7 + br i1 %cmp, label %for.body, label %for.end.18 + +for.body: ; preds = %for.cond + br label %for.cond.1 + +for.cond.1: ; preds = %for.inc.13, %for.body + %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc.13 ], [ 0, %for.body ] + %tmp8 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 1), align 4 + %tmp9 = sext i32 %tmp8 to i64 + %cmp2 = icmp slt i64 %indvars.iv3, %tmp9 + br i1 %cmp2, label %for.body.3, label %for.end.15 + +for.body.3: ; preds = %for.cond.1 + br label %for.cond.4 + +for.cond.4: ; preds = %for.inc, %for.body.3 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.3 ] + %tmp10 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bounds, i64 0, i64 0), align 4 + %tmp11 = sext i32 %tmp10 to i64 + %cmp5 = icmp slt i64 %indvars.iv, %tmp11 + br i1 %cmp5, label %for.body.6, label %for.end + +for.body.6: ; preds = %for.cond.4 + %tmp12 = add nsw i64 %indvars.iv, %indvars.iv3 + %tmp13 = add nsw i64 %tmp12, %indvars.iv5 + %tmp14 = trunc i64 %tmp13 to i32 + %conv = sitofp i32 %tmp14 to double + %arrayidx11 = getelementptr inbounds [1024 x [1024 x [1024 x double]]], [1024 x [1024 x [1024 x double]]]* @data, i64 0, i64 %indvars.iv5, i64 %indvars.iv3, i64 %indvars.iv + %tmp15 = load double, double* %arrayidx11, align 8 + %add12 = fadd double %tmp15, %conv + store double %add12, double* %arrayidx11, align 8 + br label %for.inc + +for.inc: ; preds = %for.body.6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond.4 + +for.end: ; preds = %for.cond.4 + br label %for.inc.13 + +for.inc.13: ; preds = %for.end + %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1 + br label %for.cond.1 + +for.end.15: ; preds = %for.cond.1 + br label %for.inc.16 + +for.inc.16: ; preds = %for.end.15 + %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 + br label %for.cond + +for.end.18: ; preds = %for.cond + ret void +} Index: test/ScopInfo/required-invariant-loop-bounds.ll =================================================================== --- /dev/null +++ test/ScopInfo/required-invariant-loop-bounds.ll @@ -0,0 +1,66 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; +; CHECK: Invariant Accesses: { +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: MemRef_bounds[0] }; +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: MemRef_bounds[1] : tmp >= 0 }; +; CHECK: } + +; double A[1000][1000]; +; long bounds[2]; +; +; void foo() { +; +; for 
(long i = 0; i <= bounds[0]; i++) +; for (long j = 0; j <= bounds[1]; j++) +; A[i][j] += i + j; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@bounds = common global [2 x i64] zeroinitializer, align 16 +@A = common global [1000 x [1000 x double]] zeroinitializer, align 16 + +define void @foo() { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc.6, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc7, %for.inc.6 ] + %tmp = load i64, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @bounds, i64 0, i64 0), align 16 + %cmp = icmp sgt i64 %i.0, %tmp + br i1 %cmp, label %for.end.8, label %for.body + +for.body: ; preds = %for.cond + br label %for.cond.1 + +for.cond.1: ; preds = %for.inc, %for.body + %j.0 = phi i64 [ 0, %for.body ], [ %inc, %for.inc ] + %tmp1 = load i64, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @bounds, i64 0, i64 1), align 8 + %cmp2 = icmp sgt i64 %j.0, %tmp1 + br i1 %cmp2, label %for.end, label %for.body.3 + +for.body.3: ; preds = %for.cond.1 + %add = add nsw i64 %i.0, %j.0 + %conv = sitofp i64 %add to double + %arrayidx4 = getelementptr inbounds [1000 x [1000 x double]], [1000 x [1000 x double]]* @A, i64 0, i64 %i.0, i64 %j.0 + %tmp2 = load double, double* %arrayidx4, align 8 + %add5 = fadd double %tmp2, %conv + store double %add5, double* %arrayidx4, align 8 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %inc = add nuw nsw i64 %j.0, 1 + br label %for.cond.1 + +for.end: ; preds = %for.cond.1 + br label %for.inc.6 + +for.inc.6: ; preds = %for.end + %inc7 = add nuw nsw i64 %i.0, 1 + br label %for.cond + +for.end.8: ; preds = %for.cond + ret void +} Index: test/ScopInfo/tempscop-printing.ll =================================================================== --- test/ScopInfo/tempscop-printing.ll +++ test/ScopInfo/tempscop-printing.ll @@ -14,6 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" ; CHECK-LABEL: Function: f +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: MemRef_init_ptr[0] + define void @f(i64* noalias %A, i64 %N, i64* noalias %init_ptr) nounwind { entry: br label %for.i @@ -24,12 +27,8 @@ br label %entry.next entry.next: -; CHECK: Stmt_entry_next +; CHECK-NOT: Stmt_entry_next %init = load i64, i64* %init_ptr -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init[] }; br label %for.j for.j: @@ -55,6 +54,8 @@ } ; CHECK-LABEL: Function: g +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NEXT: MemRef_init_ptr[0] define void @g(i64* noalias %A, i64 %N, i64* noalias %init_ptr) nounwind { entry: br label %for.i @@ -65,12 +66,8 @@ br label %entry.next entry.next: -; CHECK: Stmt_entry_next +; CHECK-NOT: Stmt_entry_next %init = load i64, i64* %init_ptr -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init_ptr[0] }; -; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_entry_next[i0] -> MemRef_init[] }; br label %for.j for.j: