Index: include/polly/CodeGen/BlockGenerators.h =================================================================== --- include/polly/CodeGen/BlockGenerators.h +++ include/polly/CodeGen/BlockGenerators.h @@ -65,6 +65,20 @@ /// @brief Generate a new basic block for a polyhedral statement. class BlockGenerator { public: + /// @brief Map type to resolve scalar dependences. + /// + /// @see The ScalarMap and PHIOpMap member. + using ScalarAllocaMapTy = DenseMap; + + /// @brief Simple vector of instructions to store escape users. + using EscapeUserVectorTy = SmallVector; + + /// @brief Map type to resolve escaping users for scalar instructions. + /// + /// @see The EscapeMap member. + using EscapeUsersAllocaMapTy = + DenseMap>; + /// @brief Create a generator for basic blocks. /// /// @param Builder The LLVM-IR Builder used to generate the statement. The @@ -73,9 +87,14 @@ /// @param LI The loop info for the current function /// @param SE The scalar evolution info for the current function /// @param DT The dominator tree of this function. + /// @param ScalarMap Map from scalars to their demoted location. + /// @param PHIOpMap Map from PHIs to their demoted operand location. + /// @param EscapeMap Map from scalars to their escape users and locations. /// @param ExprBuilder An expression builder to generate new access functions. BlockGenerator(PollyIRBuilder &Builder, LoopInfo &LI, ScalarEvolution &SE, - DominatorTree &DT, IslExprBuilder *ExprBuilder = nullptr); + DominatorTree &DT, ScalarAllocaMapTy &ScalarMap, + ScalarAllocaMapTy &PHIOpMap, EscapeUsersAllocaMapTy &EscapeMap, + IslExprBuilder *ExprBuilder = nullptr); /// @brief Copy the basic block. /// @@ -89,6 +108,18 @@ /// @param LTS A map from old loops to new induction variables as SCEVs. void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief Finalize the code generation for the SCoP @p S. 
+ /// + /// This will initialize and finalize the scalar variables we demoted during + /// the code generation. + /// + /// @see createScalarInitialization(Region &, ValueMapT &) + /// @see createScalarFinalization(Region &) + void finalizeSCoP(Scop &S, ValueMapT &VMap); + + /// @brief An empty destructor + virtual ~BlockGenerator(){}; + protected: PollyIRBuilder &Builder; LoopInfo &LI; @@ -98,6 +129,29 @@ /// @brief The dominator tree of this function. DominatorTree &DT; + /// @brief The entry block of the current function. + BasicBlock *EntryBB; + + /// @brief Maps to resolve scalar dependences for PHI operands and scalars. + /// + ///{ + + /// The PHIOpMap is used to get the alloca to communicate a value to a PHI + /// node, hence when the operand of a PHI is demoted the corresponding write + /// access will use the PHIOpMap to look for the correct alloca. PHI nodes + /// will then read that location in order to get the correct/current operand + /// value. + ScalarAllocaMapTy &PHIOpMap; + + /// The ScalarMap is used in __all__ other cases, thus always when a scalar + /// variable is read/written and the write is not because the scalar is a PHI + /// operand. + ScalarAllocaMapTy &ScalarMap; + ///} + + /// @brief Map from instructions to their escape users as well as the alloca. + EscapeUsersAllocaMapTy &EscapeMap; + /// @brief Split @p BB to create a new one we can use to clone @p BB in. BasicBlock *splitBB(BasicBlock *BB); @@ -130,6 +184,64 @@ void copyBB(ScopStmt &Stmt, BasicBlock *BB, BasicBlock *BBCopy, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief Return the alloca for @p ScalarBase in @p Map. + /// + /// If no alloca was mapped to @p ScalarBase in @p Map a new one is created + /// and named after @p ScalarBase with the suffix @p NameExt. + /// + /// @param ScalarBase The demoted scalar instruction. + /// @param Map The map we should look for a mapped alloca instruction. 
+ /// @param NameExt The suffix we add to the name of a new created alloca. + /// @param IsNew If set it will hold true iff the alloca was created. + /// + /// @returns The alloca for @p ScalarBase in @p Map. + AllocaInst *getOrCreateAlloca(Instruction *ScalarBase, ScalarAllocaMapTy &Map, + const char *NameExt = ".s2a", + bool *IsNew = nullptr); + + /// @brief Generate reload of scalars demoted to memory and needed by @p Inst. + /// + /// @param Stmt The statement we generate code for. + /// @param Inst The instruction that might need reloaded values. + /// @param BBMap A mapping from old values to their new values in this block. + virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap); + + /// @brief Generate the scalar stores for the given statement. + /// + /// After the statement @p Stmt was copied all inner-SCoP scalar dependences + /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to + /// be demoted to memory. + /// + /// @param Stmt The statement we generate code for. + /// @param BB The basic block we generate code for. + /// @param BBMap A mapping from old values to their new values in this block. + /// @param GlobalMap A mapping for globally replaced values. + virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMAp, ValueMapT &GlobalMap); + + /// @brief Handle users of @p Inst outside the SCoP. + /// + /// @param R The current SCoP region. + /// @param Inst The current instruction we check. + /// @param InstCopy The copy of the instruction @p Inst in the optimized SCoP. + void handleOutsideUsers(const Region &R, Instruction *Inst, Value *InstCopy); + + /// @brief Initialize the memory of demoted scalars. + /// + /// If a PHI node was demoted and one of its predecessor blocks was outside + /// the SCoP we need to initialize the memory cell we demoted the PHI into + /// with the value corresponding to that predecessor. 
As a SCoP is a + /// __single__ entry region there is at most one such predecessor. + void createScalarInitialization(Region &R, ValueMapT &VMap); + + /// @brief Promote the values of demoted scalars after the SCoP. + /// + /// If a scalar value was used outside the SCoP we need to promote the value + /// stored in the memory cell allocated for that scalar and combine it with + /// the original value in the non-optimized SCoP. + void createScalarFinalization(Region &R); + /// @brief Get the new version of a value. /// /// Given an old value, we first check if a new version of this value is @@ -160,8 +272,9 @@ Value *getNewValue(ScopStmt &Stmt, const Value *Old, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S, Loop *L) const; - void copyInstScalar(ScopStmt &Stmt, const Instruction *Inst, ValueMapT &BBMap, - ValueMapT &GlobalMap, LoopToScevMapT <S); + Instruction *copyInstScalar(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S); /// @brief Get the innermost loop that surrounds an instruction. /// @@ -177,13 +290,24 @@ const Value *Pointer, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S); - Value *generateScalarLoad(ScopStmt &Stmt, const LoadInst *load, - ValueMapT &BBMap, ValueMapT &GlobalMap, - LoopToScevMapT <S); + Instruction *generateScalarLoad(ScopStmt &Stmt, const LoadInst *load, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S); - Value *generateScalarStore(ScopStmt &Stmt, const StoreInst *store, - ValueMapT &BBMap, ValueMapT &GlobalMap, - LoopToScevMapT <S); + Instruction *generateScalarStore(ScopStmt &Stmt, const StoreInst *store, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S); + + /// @brief Copy a single PHI instruction. + /// + /// The implementation in the BlockGenerator is trivial, however it allows + /// subclasses to handle PHIs different. + /// + /// @returns The nullptr as the BlockGenerator does not copy PHIs. 
+ virtual Value *copyPHIInstruction(ScopStmt &, const PHINode *, ValueMapT &, + ValueMapT &, LoopToScevMapT &) { + return nullptr; + } /// @brief Copy a single Instruction. /// @@ -201,9 +325,24 @@ /// variable to their new values /// (for values recalculated in the new ScoP, but not /// within this basic block). - void copyInstruction(ScopStmt &Stmt, const Instruction *Inst, - ValueMapT &BBMap, ValueMapT &GlobalMap, - LoopToScevMapT <S); + /// @returns The copied instruction or nullptr if no copy was made. + Value *copyInstruction(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S); + + /// @brief Helper to get the newest version of @p ScalarValue. + /// + /// @param ScalarValue The original value needed. + /// @param R The current SCoP region. + /// @param ReloadMap The scalar map for demoted values. + /// @param BBMap A mapping from old values to their new values + /// (for values recalculated within this basic block). + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + Value *getNewScalarValue(Value *ScalarValue, const Region &R, + ScalarAllocaMapTy &ReloadMap, ValueMapT &BBMap, + ValueMapT &GlobalMap); }; /// @brief Generate a new vector basic block for a polyhedral statement. @@ -376,12 +515,82 @@ /// @param LTS A map from old loops to new induction variables as SCEVs. void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief An empty destructor + virtual ~RegionGenerator(){}; + private: + /// @brief A map from old to new blocks in the region. + DenseMap BlockMap; + + /// @brief The "BBMaps" for the whole region (one for each block). + DenseMap RegionMaps; + + /// @brief Mapping to remember PHI nodes that still need incoming values. 
+ using PHINodePairTy = std::pair; + DenseMap> IncompletePHINodeMap; + /// @brief Repair the dominance tree after we created a copy block for @p BB. /// /// @returns The immediate dominator in the DT for @p BBCopy if in the region. - BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy, - DenseMap &BlockMap); + BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy); + + /// @brief Add the new operand from the copy of @p IncomingBB to @p PHICopy. + /// + /// @param Stmt The statement to code generate. + /// @param PHI The original PHI we copy. + /// @param PHICopy The copy of @p PHI. + /// @param IncomingBB An incoming block of @p PHI. + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + /// @param LTS A map from old loops to new induction variables as + /// SCEVs. + void addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI, PHINode *PHICopy, + BasicBlock *IncomingBB, ValueMapT &GlobalMap, + LoopToScevMapT <S); + + /// @brief Generate reload of scalars demoted to memory and needed by @p Inst. + /// + /// @param Stmt The statement we generate code for. + /// @param Inst The instruction that might need reloaded values. + /// @param BBMap A mapping from old values to their new values in this block. + virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap) override; + + /// @brief Generate the scalar stores for the given statement. + /// + /// After the statement @p Stmt was copied all inner-SCoP scalar dependences + /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to + /// be demoted to memory. + /// + /// @param Stmt The statement we generate code for. + /// @param BB The basic block we generate code for. + /// @param BBMap A mapping from old values to their new values in this block. 
+ /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMAp, + ValueMapT &GlobalMap) override; + + /// @brief Copy a single PHI instruction. + /// + /// This copies a single PHI instruction and updates references to old values + /// with references to new values, as defined by GlobalMap and BBMap. + /// + /// @param Stmt The statement to code generate. + /// @param PHI The PHI instruction to copy. + /// @param BBMap A mapping from old values to their new values + /// (for values recalculated within this basic block). + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + /// @param LTS A map from old loops to new induction variables as SCEVs. + /// + /// @returns The copied instruction or nullptr if no copy was made. + virtual Value *copyPHIInstruction(ScopStmt &Stmt, const PHINode *Inst, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S) override; }; } #endif Index: include/polly/CodeGen/IslNodeBuilder.h =================================================================== --- include/polly/CodeGen/IslNodeBuilder.h +++ include/polly/CodeGen/IslNodeBuilder.h @@ -33,13 +33,20 @@ DominatorTree &DT, Scop &S) : S(S), Builder(Builder), Annotator(Annotator), Rewriter(SE, DL, "polly"), ExprBuilder(Builder, IDToValue, Rewriter, DT, LI), - BlockGen(Builder, LI, SE, DT, &ExprBuilder), RegionGen(BlockGen), P(P), - DL(DL), LI(LI), SE(SE), DT(DT) {} + BlockGen(Builder, LI, SE, DT, ScalarMap, PHIOpMap, EscapeMap, + &ExprBuilder), + RegionGen(BlockGen), P(P), DL(DL), LI(LI), SE(SE), DT(DT) {} ~IslNodeBuilder() {} void addParameters(__isl_take isl_set *Context); void create(__isl_take isl_ast_node *Node); + + /// @brief Finalize code generation for the SCoP @p S. 
+ /// + /// @see BlockGenerator::finalizeSCoP(Scop &S) + void finalizeSCoP(Scop &S) { BlockGen.finalizeSCoP(S, ValueMap); } + IslExprBuilder &getExprBuilder() { return ExprBuilder; } private: @@ -51,9 +58,26 @@ SCEVExpander Rewriter; IslExprBuilder ExprBuilder; + + /// @brief Maps used by the block and region generator to demote scalars. + /// + ///@{ + + /// @brief See BlockGenerator::ScalarMap. + BlockGenerator::ScalarAllocaMapTy ScalarMap; + + /// @brief See BlockGenerator::PhiOpMap. + BlockGenerator::ScalarAllocaMapTy PHIOpMap; + + /// @brief See BlockGenerator::EscapeMap. + BlockGenerator::EscapeUsersAllocaMapTy EscapeMap; + + ///@} + + /// @brief The generator used to copy a basic block. BlockGenerator BlockGen; - /// @brief Generator for region statements. + /// @brief The generator used to copy a non-affine region. RegionGenerator RegionGen; Pass *const P; @@ -80,13 +104,13 @@ /// points to and the resulting value is returned. /// /// @param Expr The expression to code generate. - llvm::Value *generateSCEV(const SCEV *Expr); + Value *generateSCEV(const SCEV *Expr); /// A set of Value -> Value remappings to apply when generating new code. /// /// When generating new code for a ScopStmt this map is used to map certain /// llvm::Values to new llvm::Values. - polly::ValueMapT ValueMap; + ValueMapT ValueMap; // Extract the upper bound of this loop // Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -207,6 +207,12 @@ /// Updated access relation read from JSCOP file. isl_map *newAccessRelation; + /// @brief Other memory effects which are linked to this one. + /// + /// If an instruction can cause multiple memory accesses or effects they are + /// linked together (linked list fashion). + MemoryAccess *NextMA; + void assumeNoOutOfBound(const IRAccess &Access); /// @brief Compute bounds on an over approximated access relation. 
@@ -370,6 +376,12 @@
   /// @brief Align the parameters in the access relation to the scop context
   void realignParams();
 
+  /// @brief Set the next MemoryAccess (linked list fashion).
+  void setNextMA(MemoryAccess *MA) { NextMA = MA; }
+
+  /// @brief Get the next MemoryAccess (linked list fashion).
+  MemoryAccess *getNextMA() const { return NextMA; }
+
   /// @brief Print the MemoryAccess.
   ///
   /// @param OS The output stream the MemoryAccess is printed to.
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -627,7 +627,7 @@
 MemoryAccess::MemoryAccess(const IRAccess &Access, Instruction *AccInst,
                            ScopStmt *Statement, const ScopArrayInfo *SAI)
     : AccType(getMemoryAccessType(Access)), Statement(Statement), Inst(AccInst),
-      newAccessRelation(nullptr) {
+      newAccessRelation(nullptr), NextMA(nullptr) {
   isl_ctx *Ctx = Statement->getIslCtx();
   BaseAddr = Access.getBase();
@@ -882,16 +882,10 @@
     MemAccs.push_back(new MemoryAccess(Access, AccessInst, this, SAI));
 
-    // We do not track locations for scalar memory accesses at the moment.
-    //
-    // We do not have a use for this information at the moment. If we need this
-    // at some point, the "instruction -> access" mapping needs to be enhanced
-    // as a single instruction could then possibly perform multiple accesses.
- if (!Access.isScalar()) { - assert(!InstructionToAccess.count(AccessInst) && - "Unexpected 1-to-N mapping on instruction to access map!"); - InstructionToAccess[AccessInst] = MemAccs.back(); - } + MemoryAccess *&MA = InstructionToAccess[AccessInst]; + if (MA) + MemAccs.back()->setNextMA(MA); + MA = MemAccs.back(); } } Index: lib/CodeGen/BlockGenerators.cpp =================================================================== --- lib/CodeGen/BlockGenerators.cpp +++ lib/CodeGen/BlockGenerators.cpp @@ -85,8 +85,13 @@ BlockGenerator::BlockGenerator(PollyIRBuilder &B, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + ScalarAllocaMapTy &ScalarMap, + ScalarAllocaMapTy &PHIOpMap, + EscapeUsersAllocaMapTy &EscapeMap, IslExprBuilder *ExprBuilder) - : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT) {} + : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT), + EntryBB(nullptr), PHIOpMap(PHIOpMap), ScalarMap(ScalarMap), + EscapeMap(EscapeMap) {} Value *BlockGenerator::getNewValue(ScopStmt &Stmt, const Value *Old, ValueMapT &BBMap, ValueMapT &GlobalMap, @@ -144,14 +149,16 @@ return nullptr; } -void BlockGenerator::copyInstScalar(ScopStmt &Stmt, const Instruction *Inst, - ValueMapT &BBMap, ValueMapT &GlobalMap, - LoopToScevMapT <S) { +Instruction *BlockGenerator::copyInstScalar(ScopStmt &Stmt, + const Instruction *Inst, + ValueMapT &BBMap, + ValueMapT &GlobalMap, + LoopToScevMapT <S) { // We do not generate debug intrinsics as we did not investigate how to // copy them correctly. At the current state, they just crash the code // generation as the meta-data operands are not correctly copied. 
if (isa(Inst)) - return; + return nullptr; Instruction *NewInst = Inst->clone(); @@ -164,7 +171,7 @@ assert(!isa(NewInst) && "Store instructions are always needed!"); delete NewInst; - return; + return nullptr; } NewInst->replaceUsesOfWith(OldOperand, NewOperand); @@ -175,6 +182,8 @@ if (!NewInst->getType()->isVoidTy()) NewInst->setName("p_" + Inst->getName()); + + return NewInst; } Value *BlockGenerator::getNewAccessOperand(ScopStmt &Stmt, @@ -215,61 +224,76 @@ return LI.getLoopFor(Inst->getParent()); } -Value *BlockGenerator::generateScalarLoad(ScopStmt &Stmt, const LoadInst *Load, - ValueMapT &BBMap, - ValueMapT &GlobalMap, - LoopToScevMapT <S) { +Instruction *BlockGenerator::generateScalarLoad(ScopStmt &Stmt, + const LoadInst *Load, + ValueMapT &BBMap, + ValueMapT &GlobalMap, + LoopToScevMapT <S) { const Value *Pointer = Load->getPointerOperand(); Value *NewPointer = generateLocationAccessed(Stmt, Load, Pointer, BBMap, GlobalMap, LTS); - Value *ScalarLoad = Builder.CreateAlignedLoad( + Instruction *ScalarLoad = Builder.CreateAlignedLoad( NewPointer, Load->getAlignment(), Load->getName() + "_p_scalar_"); return ScalarLoad; } -Value *BlockGenerator::generateScalarStore(ScopStmt &Stmt, - const StoreInst *Store, - ValueMapT &BBMap, - ValueMapT &GlobalMap, - LoopToScevMapT <S) { +Instruction *BlockGenerator::generateScalarStore(ScopStmt &Stmt, + const StoreInst *Store, + ValueMapT &BBMap, + ValueMapT &GlobalMap, + LoopToScevMapT <S) { const Value *Pointer = Store->getPointerOperand(); Value *NewPointer = generateLocationAccessed(Stmt, Store, Pointer, BBMap, GlobalMap, LTS); Value *ValueOperand = getNewValue(Stmt, Store->getValueOperand(), BBMap, GlobalMap, LTS, getLoopForInst(Store)); - Value *NewStore = Builder.CreateAlignedStore(ValueOperand, NewPointer, - Store->getAlignment()); + Instruction *NewStore = Builder.CreateAlignedStore(ValueOperand, NewPointer, + Store->getAlignment()); return NewStore; } -void BlockGenerator::copyInstruction(ScopStmt &Stmt, const 
Instruction *Inst, - ValueMapT &BBMap, ValueMapT &GlobalMap, - LoopToScevMapT <S) { +Value *BlockGenerator::copyInstruction(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S) { + + // First check for possible scalar dependences for this instruction. + generateScalarLoads(Stmt, Inst, BBMap); + // Terminator instructions control the control flow. They are explicitly // expressed in the clast and do not need to be copied. if (Inst->isTerminator()) - return; + return nullptr; - if (canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) - return; + Loop *L = getLoopForInst(Inst); + if ((Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) && + canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) { + Value *NewValue = getNewValue(Stmt, Inst, BBMap, GlobalMap, LTS, L); + BBMap[Inst] = NewValue; + return NewValue; + } if (const LoadInst *Load = dyn_cast(Inst)) { - Value *NewLoad = generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS); + Instruction *NewLoad = + generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS); // Compute NewLoad before its insertion in BBMap to make the insertion // deterministic. BBMap[Load] = NewLoad; - return; + return NewLoad; } if (const StoreInst *Store = dyn_cast(Inst)) { - Value *NewStore = generateScalarStore(Stmt, Store, BBMap, GlobalMap, LTS); + Instruction *NewStore = + generateScalarStore(Stmt, Store, BBMap, GlobalMap, LTS); // Compute NewStore before its insertion in BBMap to make the insertion // deterministic. BBMap[Store] = NewStore; - return; + return NewStore; } + if (const PHINode *PHI = dyn_cast(Inst)) + return copyPHIInstruction(Stmt, PHI, BBMap, GlobalMap, LTS); + // Skip some special intrinsics for which we do not adjust the semantics to // the new schedule. All others are handled like every other instruction. 
if (auto *IT = dyn_cast(Inst)) { @@ -287,14 +311,14 @@ case llvm::Intrinsic::donothing: case llvm::Intrinsic::assume: case llvm::Intrinsic::expect: - return; + return nullptr; default: // Other intrinsics are copied. break; } } - copyInstScalar(Stmt, Inst, BBMap, GlobalMap, LTS); + return copyInstScalar(Stmt, Inst, BBMap, GlobalMap, LTS); } void BlockGenerator::copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, @@ -327,8 +351,342 @@ ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S) { Builder.SetInsertPoint(CopyBB->begin()); + EntryBB = &CopyBB->getParent()->getEntryBlock(); + + for (Instruction &Inst : *BB) { + Value *InstCopy = copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS); + (void)InstCopy; + assert((InstCopy == nullptr || BBMap[&Inst] == InstCopy) && + "Bad copy mapping found"); + } + + // After a basic block was copied store all scalars that escape this block + // in their alloca. First the scalars that have dependences inside the SCoP, + // then the ones that might escape the SCoP. + generateScalarStores(Stmt, BB, BBMap, GlobalMap); + + const Region &R = Stmt.getParent()->getRegion(); for (Instruction &Inst : *BB) - copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS); + handleOutsideUsers(R, &Inst, BBMap[&Inst]); +} + +AllocaInst *BlockGenerator::getOrCreateAlloca(Instruction *ScalarBase, + ScalarAllocaMapTy &Map, + const char *NameExt, + bool *IsNew) { + + // Check if an alloca was cached for the base instruction. + AllocaInst *&Addr = Map[ScalarBase]; + + // If needed indicate if it was found already or will be created. + if (IsNew) + *IsNew = (Addr == nullptr); + + // If no alloca was found create one and insert it in the entry block. 
+ if (!Addr) { + auto *Ty = ScalarBase->getType(); + Addr = new AllocaInst(Ty, ScalarBase->getName() + NameExt); + Addr->insertBefore(EntryBB->getFirstInsertionPt()); + } + + return Addr; +} + +void BlockGenerator::handleOutsideUsers(const Region &R, Instruction *Inst, + Value *InstCopy) { + BasicBlock *ExitBB = R.getExit(); + + EscapeUserVectorTy EscapeUsers; + for (User *U : Inst->users()) { + + // Non-instruction user will never escape. + Instruction *UI = dyn_cast(U); + if (!UI) + continue; + + if (R.contains(UI) && ExitBB != UI->getParent()) + continue; + + EscapeUsers.push_back(UI); + } + + // Exit if no escape uses were found. + if (EscapeUsers.empty()) + return; + + // If there are escape users we get the alloca for this instruction and put + // it in the EscapeMap for later finalization. However, if the alloca was not + // created by a already handled scalar dependence we have to initialize it + // also. Lastly, if the instruction was copied multiple times we already did + // this and can exit. + if (EscapeMap.count(Inst)) + return; + + // Get or create an escape alloca for this instruction. + bool IsNew; + AllocaInst *ScalarAddr = + getOrCreateAlloca(Inst, ScalarMap, ".escape", &IsNew); + + // Remember that this instruction has escape uses and the escape alloca. + EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers)); + + // If the escape alloca was just created store the instruction in there, + // otherwise that happened already. + if (IsNew) { + assert(InstCopy && "Except PHIs every instruction should have a copy!"); + Builder.CreateStore(InstCopy, ScalarAddr); + } +} + +void BlockGenerator::generateScalarLoads(ScopStmt &Stmt, + const Instruction *Inst, + ValueMapT &BBMap) { + + // Iterate over all memory accesses for the given instruction and handle all + // scalar reads. 
+ if (MemoryAccess *MA = Stmt.lookupAccessFor(Inst)) { + do { + if (!MA->isScalar() || !MA->isRead()) + continue; + + Instruction *ScalarBase = cast(MA->getBaseAddr()); + Instruction *ScalarInst = MA->getAccessInstruction(); + + PHINode *ScalarBasePHI = dyn_cast(ScalarBase); + + // This is either a common scalar use (second case) or the use of a phi + // operand by the PHI node (first case). + if (ScalarBasePHI == ScalarInst) { + AllocaInst *PHIOpAddr = + getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + LoadInst *LI = + Builder.CreateLoad(PHIOpAddr, PHIOpAddr->getName() + ".reload"); + BBMap[ScalarBase] = LI; + } else { + // For non-PHI operand uses we look up the alloca in the ScalarMap, + // reload it and add the mapping to the ones in the current basic block. + AllocaInst *ScalarAddr = + getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + LoadInst *LI = + Builder.CreateLoad(ScalarAddr, ScalarAddr->getName() + ".reload"); + BBMap[ScalarBase] = LI; + } + + } while ((MA = MA->getNextMA())); + } +} + +Value *BlockGenerator::getNewScalarValue(Value *ScalarValue, const Region &R, + ScalarAllocaMapTy &ReloadMap, + ValueMapT &BBMap, + ValueMapT &GlobalMap) { + // If the value we want to store is a instruction we might have demoted it + // in order to make it accessible here. In such a case a reload is + // necessary. If it is no instruction it will always be a value that + // dominates the current point and we can just use it. In total there are 4 + // options: + // (1) The value is no instruction ==> use the value. + // (2) The value is an instruction that was split out the region prior to + // code generation ==> use the instruction as it dominates the region. + // (3) The value is an instruction: + // (a) The value was defined in the current block, thus a copy is in + // the BBMap ==> use the mapped value. + // (b) The value was defined in a previous block, thus demoted it + // earlier ==> use the reloaded value. 
+ Instruction *ScalarValueInst = dyn_cast(ScalarValue); + if (!ScalarValueInst) + return ScalarValue; + + if (!R.contains(ScalarValueInst)) { + if (Value *ScalarValueCopy = GlobalMap.lookup(ScalarValueInst)) + return /* Case (3a) */ ScalarValueCopy; + else + return /* Case 2 */ ScalarValue; + } + + if (Value *ScalarValueCopy = BBMap.lookup(ScalarValueInst)) + return /* Case (3a) */ ScalarValueCopy; + + // Case (3b) + assert(ReloadMap.count(ScalarValueInst) && + "ScalarInst not mapped in the block and not in the given reload map!"); + Value *ReloadAddr = ReloadMap[ScalarValueInst]; + ScalarValue = + Builder.CreateLoad(ReloadAddr, ReloadAddr->getName() + ".reload"); + + return ScalarValue; +} + +void BlockGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMap, + ValueMapT &GlobalMap) { + const Region &R = Stmt.getParent()->getRegion(); + + assert(Stmt.isBlockStmt() && BB == Stmt.getBasicBlock() && + "Region statements need to use the generateScalarStores() " + "function in the RegionGenerator"); + + // Set to remember a store to the phiops alloca of a PHINode. It is needed as + // we might have multiple write accesses to the same PHI and while one is the + // self write of the PHI (to the ScalarMap alloca) the other is the write to + // the operand alloca (PHIOpMap). + SmallPtrSet SeenPHIs; + + // Iterate over all accesses in the given statement. + for (auto MI = Stmt.begin(), ME = Stmt.end(); MI != ME; MI++) { + MemoryAccess *MA = *MI; + + // Skip non-scalar and read accesses. + if (!MA->isScalar() || MA->isRead()) + continue; + + Instruction *ScalarBase = cast(MA->getBaseAddr()); + Instruction *ScalarInst = MA->getAccessInstruction(); + PHINode *ScalarBasePHI = dyn_cast(ScalarBase); + + // Get the alloca node for the base instruction and the value we want to + // store. In total there are 4 options: + // (1) The base is no PHI, hence it is a simple scalar def-use chain. 
+ // (2) The base is a PHI, + // (a) and the write is caused by an operand in the block. + // (b) and it is the PHI self write (same as case (1)). + // (c) (2a) and (2b) are not distinguishable. + // For case (1) and (2b) we get the alloca from the scalar map and the value + // we want to store is initialized with the instruction attached to the + // memory access. For case (2a) we get the alloca from the PHI operand map + // and the value we want to store is initialized with the incoming value for + // this block. The tricky case (2c) is when both (2a) and (2b) match. This + // happens if the PHI operand is in the same block as the PHI. To handle + // that we choose the alloca of (2a) first and (2b) for the next write + // access to that PHI (there must be 2). + Value *ScalarValue = nullptr; + AllocaInst *ScalarAddr = nullptr; + + if (!ScalarBasePHI) { + // Case (1) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else { + int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB); + if (ScalarBasePHI != ScalarInst) { + // Case (2a) + assert(PHIIdx >= 0 && "Bad scalar write to PHI operand"); + SeenPHIs.insert(ScalarBasePHI); + ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx); + } else if (PHIIdx < 0) { + // Case (2b) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else { + // Case (2c) + if (SeenPHIs.insert(ScalarBasePHI).second) { + // First access ==> same as (2a) + ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx); + } else { + // Second access ==> same as (2b) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } + } + } + + ScalarValue = + getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap); + Builder.CreateStore(ScalarValue, ScalarAddr); + } +} + +void 
BlockGenerator::createScalarInitialization(Region &R,
+                                           ValueMapT &GlobalMap) {
+  // The split block __just before__ the region and optimized region.
+  BasicBlock *SplitBB = R.getEnteringBlock();
+  BranchInst *SplitBBTerm = cast<BranchInst>(SplitBB->getTerminator());
+  assert(SplitBBTerm->getNumSuccessors() == 2 && "Bad region entering block!");
+
+  // Get the start block of the __optimized__ region.
+  BasicBlock *StartBB = SplitBBTerm->getSuccessor(0);
+  if (StartBB == R.getEntry())
+    StartBB = SplitBBTerm->getSuccessor(1);
+
+  // For each PHI predecessor outside the region store the incoming operand
+  // value prior to entering the optimized region.
+  Builder.SetInsertPoint(StartBB->getTerminator());
+
+  ScalarAllocaMapTy EmptyMap;
+  for (const auto &PHIOpMapping : PHIOpMap) {
+    const PHINode *PHI = cast<PHINode>(PHIOpMapping.getFirst());
+
+    // Check if this PHI has the split block as predecessor (that is the only
+    // possible predecessor outside the SCoP).
+    int idx = PHI->getBasicBlockIndex(SplitBB);
+    if (idx < 0)
+      continue;
+
+    Value *ScalarValue = PHI->getIncomingValue(idx);
+    ScalarValue =
+        getNewScalarValue(ScalarValue, R, EmptyMap, GlobalMap, GlobalMap);
+
+    // If the split block is the predecessor initialize the PHI operator alloca.
+    Builder.CreateStore(ScalarValue, PHIOpMapping.getSecond());
+  }
+}
+
+void BlockGenerator::createScalarFinalization(Region &R) {
+  // The exit block of the __unoptimized__ region.
+  BasicBlock *ExitBB = R.getExitingBlock();
+  // The merge block __just after__ the region and the optimized region.
+  BasicBlock *MergeBB = R.getExit();
+
+  // The exit block of the __optimized__ region.
+ BasicBlock *OptExitBB = *(pred_begin(MergeBB)); + if (OptExitBB == ExitBB) + OptExitBB = *(++pred_begin(MergeBB)); + + Builder.SetInsertPoint(OptExitBB->getTerminator()); + for (const auto &EscapeMapping : EscapeMap) { + // Extract the escaping instruction and the escaping users as well as the + // alloca the instruction was demoted to. + Instruction *EscapeInst = EscapeMapping.getFirst(); + const auto &EscapeMappingValue = EscapeMapping.getSecond(); + const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second; + AllocaInst *ScalarAddr = EscapeMappingValue.first; + + // Reload the demoted instruction in the optimized version of the SCoP. + Instruction *EscapeInstReload = + Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload"); + + // Create the merge PHI that merges the optimized and unoptimized version. + PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2, + EscapeInst->getName() + ".merge"); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); + + // Add the respective values to the merge PHI. + MergePHI->addIncoming(EscapeInstReload, OptExitBB); + MergePHI->addIncoming(EscapeInst, ExitBB); + + // The information of scalar evolution about the escaping instruction needs + // to be revoked so the new merged instruction will be used. + if (SE.isSCEVable(EscapeInst->getType())) + SE.forgetValue(EscapeInst); + + // Replace all uses of the demoted instruction with the merge PHI. 
+ for (Instruction *EUser : EscapeUsers) + EUser->replaceUsesOfWith(EscapeInst, MergePHI); + } +} + +void BlockGenerator::finalizeSCoP(Scop &S, ValueMapT &GlobalMap) { + createScalarInitialization(S.getRegion(), GlobalMap); + createScalarFinalization(S.getRegion()); } VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen, @@ -683,9 +1041,8 @@ copyInstruction(Stmt, &Inst, VectorBlockMap, ScalarBlockMap); } -BasicBlock *RegionGenerator::repairDominance( - BasicBlock *BB, BasicBlock *BBCopy, - DenseMap &BlockMap) { +BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB, + BasicBlock *BBCopy) { BasicBlock *BBIDom = DT.getNode(BB)->getIDom()->getBlock(); BasicBlock *BBCopyIDom = BlockMap.lookup(BBIDom); @@ -701,20 +1058,31 @@ assert(Stmt.isRegionStmt() && "Only region statements can be copied by the block generator"); + // Forget all old mappings. + BlockMap.clear(); + RegionMaps.clear(); + IncompletePHINodeMap.clear(); + // The region represented by the statement. Region *R = Stmt.getRegion(); - // The "BBMaps" for the whole region. - DenseMap RegionMaps; + // Create a dedicated entry for the region where we can reload all demoted + // inputs. + BasicBlock *EntryBB = R->getEntry(); + BasicBlock *EntryBBCopy = + SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI); + EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry"); + Builder.SetInsertPoint(EntryBBCopy->begin()); - // A map from old to new blocks in the region - DenseMap BlockMap; + for (auto PI = pred_begin(EntryBB), PE = pred_end(EntryBB); PI != PE; ++PI) + if (!R->contains(*PI)) + BlockMap[*PI] = EntryBBCopy; // Iterate over all blocks in the region in a breadth-first search. 
std::deque Blocks; SmallPtrSet SeenBlocks; - Blocks.push_back(R->getEntry()); - SeenBlocks.insert(R->getEntry()); + Blocks.push_back(EntryBB); + SeenBlocks.insert(EntryBB); while (!Blocks.empty()) { BasicBlock *BB = Blocks.front(); @@ -722,7 +1090,10 @@ // First split the block and update dominance information. BasicBlock *BBCopy = splitBB(BB); - BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy, BlockMap); + BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy); + + // In order to remap PHI nodes we store also basic block mappings. + BlockMap[BB] = BBCopy; // Get the mapping for this block and initialize it with the mapping // available at its immediate dominator (in the new region). @@ -732,22 +1103,28 @@ // Copy the block with the BlockGenerator. copyBB(Stmt, BB, BBCopy, RegionMap, GlobalMap, LTS); + // In order to remap PHI nodes we store also basic block mappings. + BlockMap[BB] = BBCopy; + + // Add values to incomplete PHI nodes waiting for this block to be copied. + for (const PHINodePairTy &PHINodePair : IncompletePHINodeMap[BB]) + addOperandToPHI(Stmt, PHINodePair.first, PHINodePair.second, BB, + GlobalMap, LTS); + IncompletePHINodeMap[BB].clear(); + // And continue with new successors inside the region. for (auto SI = succ_begin(BB), SE = succ_end(BB); SI != SE; SI++) if (R->contains(*SI) && SeenBlocks.insert(*SI).second) Blocks.push_back(*SI); - - // In order to remap PHI nodes we store also basic block mappings. - BlockMap[BB] = BBCopy; } // Now create a new dedicated region exit block and add it to the region map. BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI); - ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".as.exit"); + ExitBBCopy->setName("polly.stmt." 
+ R->getExit()->getName() + ".exit"); BlockMap[R->getExit()] = ExitBBCopy; - repairDominance(R->getExit(), ExitBBCopy, BlockMap); + repairDominance(R->getExit(), ExitBBCopy); // As the block generator doesn't handle control flow we need to add the // region control flow by hand after all blocks have been copied. @@ -766,6 +1143,179 @@ BICopy->eraseFromParent(); } + // Add counting PHI nodes to all loops in the region that can be used as + // replacement for SCEVs refering to the old loop. + for (BasicBlock *BB : SeenBlocks) { + Loop *L = LI.getLoopFor(BB); + if (L == nullptr || L->getHeader() != BB) + continue; + + BasicBlock *BBCopy = BlockMap[BB]; + Value *NullVal = Builder.getInt32(0); + PHINode *LoopPHI = + PHINode::Create(Builder.getInt32Ty(), 2, "polly.subregion.iv"); + Instruction *LoopPHIInc = BinaryOperator::CreateAdd( + LoopPHI, Builder.getInt32(1), "polly.subregion.iv.inc"); + LoopPHI->insertBefore(BBCopy->begin()); + LoopPHIInc->insertBefore(BBCopy->getTerminator()); + + for (auto *PredBB : make_range(pred_begin(BB), pred_end(BB))) { + if (!R->contains(PredBB)) + continue; + if (L->contains(PredBB)) + LoopPHI->addIncoming(LoopPHIInc, BlockMap[PredBB]); + else + LoopPHI->addIncoming(NullVal, BlockMap[PredBB]); + } + + for (auto *PredBBCopy : make_range(pred_begin(BBCopy), pred_end(BBCopy))) + if (LoopPHI->getBasicBlockIndex(PredBBCopy) < 0) + LoopPHI->addIncoming(NullVal, PredBBCopy); + + LTS[L] = SE.getUnknown(LoopPHI); + } + + // Add all mappings from the region to the global map so outside uses will use + // the copied instructions. + for (auto &BBMap : RegionMaps) + GlobalMap.insert(BBMap.second.begin(), BBMap.second.end()); + // Reset the old insert point for the build. Builder.SetInsertPoint(ExitBBCopy->begin()); } + +void RegionGenerator::generateScalarLoads(ScopStmt &Stmt, + const Instruction *Inst, + ValueMapT &BBMap) { + + // Inside a non-affine region PHI nodes are copied not demoted. 
Once the
+  // phi is copied it will reload all inputs from outside the region, hence
+  // we do not need to generate code for the read access of the operands of a
+  // PHI.
+  if (isa<PHINode>(Inst))
+    return;
+
+  return BlockGenerator::generateScalarLoads(Stmt, Inst, BBMap);
+}
+
+void RegionGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB,
+                                           ValueMapT &BBMap,
+                                           ValueMapT &GlobalMap) {
+  const Region &R = Stmt.getParent()->getRegion();
+
+  Region *StmtR = Stmt.getRegion();
+  assert(StmtR && "Block statements need to use the generateScalarStores() "
+                  "function in the BlockGenerator");
+
+  BasicBlock *ExitBB = StmtR->getExit();
+  (void)ExitBB;
+
+  // For region statements three kinds of scalar stores exist:
+  //  (1) A definition used by a non-phi instruction outside the region.
+  //  (2) A phi-instruction in the region entry.
+  //  (3) A write to a phi instruction in the region exit.
+  // The last case is the tricky one since we do not know anymore which
+  // predecessor of the exit needs to store the operand value that doesn't
+  // have a definition in the region. Therefore, we only emit the store in the
+  // predecessor block the operand is incoming from and skip the access in all
+  // other blocks of the region (see the PHI index check below).
+
+  // Iterate over all accesses in the given statement.
+  for (auto MI = Stmt.begin(), ME = Stmt.end(); MI != ME; MI++) {
+    MemoryAccess *MA = *MI;
+
+    // Skip non-scalar and read accesses.
+    if (!MA->isScalar() || MA->isRead())
+      continue;
+
+    Instruction *ScalarBase = cast<Instruction>(MA->getBaseAddr());
+    Instruction *ScalarInst = MA->getAccessInstruction();
+    PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+
+    Value *ScalarValue = nullptr;
+    AllocaInst *ScalarAddr = nullptr;
+
+    if (!ScalarBasePHI) {
+      // Case (1)
+      ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+      ScalarValue = ScalarInst;
+    } else if (ScalarBasePHI->getParent() != ExitBB) {
+      // Case (2)
+      assert(ScalarBasePHI->getParent() == StmtR->getEntry() &&
+             "Bad PHI self write in non-affine region");
+      assert(ScalarBase == ScalarInst &&
+             "Bad PHI self write in non-affine region");
+      ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a");
+      ScalarValue = ScalarInst;
+    } else {
+      int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB);
+      // Skip accesses we will not handle in this basic block but in another one
+      // in the statement region.
+      if (PHIIdx < 0)
+        continue;
+
+      // Case (3)
+      ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops");
+      ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx);
+    }
+
+    ScalarValue =
+        getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap);
+    Builder.CreateStore(ScalarValue, ScalarAddr);
+  }
+}
+
+void RegionGenerator::addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI,
+                                      PHINode *PHICopy, BasicBlock *IncomingBB,
+                                      ValueMapT &GlobalMap,
+                                      LoopToScevMapT &LTS) {
+  Region *StmtR = Stmt.getRegion();
+
+  // If the incoming block was not yet copied mark this PHI as incomplete.
+  // Once the block will be copied the incoming value will be added.
+  BasicBlock *BBCopy = BlockMap[IncomingBB];
+  if (!BBCopy) {
+    assert(StmtR->contains(IncomingBB) &&
+           "Bad incoming block for PHI in non-affine region");
+    IncompletePHINodeMap[IncomingBB].push_back(std::make_pair(PHI, PHICopy));
+    return;
+  }
+
+  Value *OpCopy = nullptr;
+  if (StmtR->contains(IncomingBB)) {
+    assert(RegionMaps.count(BBCopy) &&
+           "Incoming PHI block did not have a BBMap");
+    ValueMapT &BBCopyMap = RegionMaps[BBCopy];
+
+    Value *Op = PHI->getIncomingValueForBlock(IncomingBB);
+    OpCopy =
+        getNewValue(Stmt, Op, BBCopyMap, GlobalMap, LTS, getLoopForInst(PHI));
+  } else {
+
+    if (PHICopy->getBasicBlockIndex(BBCopy) >= 0)
+      return;
+
+    AllocaInst *PHIOpAddr =
+        getOrCreateAlloca(const_cast<PHINode *>(PHI), PHIOpMap, ".phiops");
+    OpCopy = new LoadInst(PHIOpAddr, PHIOpAddr->getName() + ".reload",
+                          BlockMap[IncomingBB]->getTerminator());
+  }
+
+  assert(OpCopy && "Incoming PHI value was not copied properly");
+  assert(BBCopy && "Incoming PHI block was not copied properly");
+  PHICopy->addIncoming(OpCopy, BBCopy);
+}
+
+Value *RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, const PHINode *PHI,
+                                           ValueMapT &BBMap,
+                                           ValueMapT &GlobalMap,
+                                           LoopToScevMapT &LTS) {
+  unsigned NumIncoming = PHI->getNumIncomingValues();
+  PHINode *PHICopy =
+      Builder.CreatePHI(PHI->getType(), NumIncoming, "polly."
+ PHI->getName());
+  PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI());
+  BBMap[PHI] = PHICopy;
+
+  for (unsigned u = 0; u < NumIncoming; u++)
+    addOperandToPHI(Stmt, PHI, PHICopy, PHI->getIncomingBlock(u), GlobalMap,
+                    LTS);
+  return PHICopy;
+}
Index: lib/CodeGen/IslCodeGeneration.cpp
===================================================================
--- lib/CodeGen/IslCodeGeneration.cpp
+++ lib/CodeGen/IslCodeGeneration.cpp
@@ -132,8 +132,10 @@
 
   NodeBuilder.create(AstRoot);
 
+  NodeBuilder.finalizeSCoP(S);
+
   assert(!verifyGeneratedFunction(S, *EnteringBB->getParent()) &&
          "Verification of generated function failed");
   return true;
 }
 
Index: test/Isl/CodeGen/phi_condition_modeling_1.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/phi_condition_modeling_1.ll
@@ -0,0 +1,59 @@
+; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen-isl < %s | FileCheck %s
+;
+; void f(int *A, int c, int N) {
+;   int tmp;
+;   for (int i = 0; i < N; i++) {
+;     if (i > c)
+;       tmp = 3;
+;     else
+;       tmp = 5;
+;     A[i] = tmp;
+;   }
+; }
+;
+; CHECK-LABEL: bb:
+; CHECK:         %tmp.0.phiops = alloca i32
+; CHECK-LABEL: polly.stmt.bb8:
+; CHECK:         %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops
+; CHECK:         store i32 %tmp.0.phiops.reload, i32*
+; CHECK-LABEL: polly.stmt.bb6:
+; CHECK:         store i32 3, i32* %tmp.0.phiops
+; CHECK-LABEL: polly.stmt.bb7:
+; CHECK:         store i32 5, i32* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32 %c, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  %tmp1 = sext i32 %c to i64
+  br label %bb2
+
+bb2:                                              ; preds = %bb10, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ]
+  %tmp3 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp3, label %bb4, label %bb11
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = icmp sgt i64 %indvars.iv, %tmp1
+  br i1 %tmp5, label %bb6, 
label %bb7 + +bb6: ; preds = %bb4 + br label %bb8 + +bb7: ; preds = %bb4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6 + %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ] + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp.0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb2 + +bb11: ; preds = %bb2 + ret void +} Index: test/Isl/CodeGen/phi_condition_modeling_2.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/phi_condition_modeling_2.ll @@ -0,0 +1,66 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s +; +; void f(int *A, int c, int N) { +; int tmp; +; for (int i = 0; i < N; i++) { +; if (i > c) +; tmp = 3; +; else +; tmp = 5; +; A[i] = tmp; +; } +; } +; +; CHECK-LABEL: bb: +; CHECK-DAG: %tmp.0.s2a = alloca i32 +; CHECK-DAG: %tmp.0.phiops = alloca i32 +; CHECK-LABEL: polly.stmt.bb8: +; CHECK: %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops +; CHECK: store i32 %tmp.0.phiops.reload, i32* %tmp.0.s2a +; CHECK-LABEL: polly.stmt.bb8b: +; CHECK: %tmp.0.s2a.reload = load i32, i32* %tmp.0.s2a +; CHECK: store i32 %tmp.0.s2a.reload, +; CHECK-LABEL: polly.stmt.bb6: +; CHECK: store i32 3, i32* %tmp.0.phiops +; CHECK-LABEL: polly.stmt.bb7: +; CHECK: store i32 5, i32* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %c, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + %tmp1 = sext i32 %c to i64 + br label %bb2 + +bb2: ; preds = %bb10, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ] + %tmp3 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp3, label %bb4, label %bb11 + +bb4: ; preds = %bb2 + %tmp5 = icmp sgt i64 %indvars.iv, %tmp1 + br i1 %tmp5, label %bb6, label %bb7 + +bb6: ; preds = %bb4 + br label %bb8 + +bb7: ; preds = %bb4 + 
br label %bb8 + +bb8: ; preds = %bb7, %bb6 + %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ] + br label %bb8b + +bb8b: + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp.0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb2 + +bb11: ; preds = %bb2 + ret void +} Index: test/Isl/CodeGen/phi_conditional_simple_1.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/phi_conditional_simple_1.ll @@ -0,0 +1,72 @@ +; RUN: opt %loadPolly -analyze -polly-ast -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen-isl < %s | FileCheck %s +; +; void jd(int *A, int c) { +; for (int i = 0; i < 1024; i++) { +; if (c) +; A[i] = 1; +; else +; A[i] = 2; +; } +; } + +; AST: for (int c0 = 0; c0 <= 1023; c0 += 1) { +; AST: if (c <= -1) { +; AST: Stmt_if_then(c0); +; AST: } else if (c >= 1) { +; AST: Stmt_if_then(c0); +; AST: } else +; AST: Stmt_if_else(c0); +; AST: Stmt_if_end(c0); +; AST: } +; +; CHECK-LABEL: entry: +; CHECK-NEXT: %phi.phiops = alloca i32 +; CHECK-LABEL: polly.stmt.if.end: +; CHECK-NEXT: %phi.phiops.reload = load i32, i32* %phi.phiops +; CHECK-NEXT: %scevgep +; CHECK-NEXT: store i32 %phi.phiops.reload, i32* +; CHECK-LABEL: polly.stmt.if.then: +; CHECK-NEXT: store i32 1, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; CHECK-LABEL: polly.stmt.if.then{{.}}: +; CHECK-NEXT: store i32 1, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; CHECK-LABEL: polly.stmt.if.else: +; CHECK-NEXT: store i32 2, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @jd(i32* %A, i32 %c) { +entry: + br label %for.cond + +for.cond: + %indvars.iv = phi i64 [ 
%indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.body: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %phi = phi i32 [ 1, %if.then], [ 2, %if.else ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %phi, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: + ret void +} Index: test/Isl/CodeGen/phi_loop_carried_float.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/phi_loop_carried_float.ll @@ -0,0 +1,67 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s +; +; float f(float *A, int N) { +; float tmp = 0; +; for (int i = 0; i < N; i++) +; tmp += A[i]; +; } +; +; CHECK: bb: +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; CHECK-DAG: %tmp.0.s2a = alloca float +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; CHECK-DAG: %tmp.0.phiops = alloca float +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; +; CHECK: polly.merge_new_and_old: +; CHECK-NEXT: ret +; +; CHECK: polly.start: +; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops + +; CHECK: polly.merge: +; CHECK-NEXT: br label %polly.merge_new_and_old + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R1:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK: store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK: store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb4: ; preds = %polly.then3 +; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float, 
float* %scevgep, align 4, !alias.scope !0, !noalias !2 +; CHECK: %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a +; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_ +; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ] + %tmp2 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp2, label %bb3, label %bb8 + +bb3: ; preds = %bb1 + br label %bb4 + +bb4: ; preds = %bb3 + %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %tmp6 = load float, float* %tmp5, align 4 + %tmp7 = fadd float %tmp.0, %tmp6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb8: ; preds = %bb1 + br label %exit + +exit: + ret void +} Index: test/Isl/CodeGen/phi_loop_carried_float_escape.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/phi_loop_carried_float_escape.ll @@ -0,0 +1,63 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s +; +; float f(float *A, int N) { +; float tmp = 0; +; for (int i = 0; i < N; i++) +; tmp += A[i]; +; return tmp; +; } +; +; CHECK: polly.merge_new_and_old: +; CHECK-NEXT: %tmp.0.merge = phi float [ %tmp.0.final_reload, %polly.merge ], [ %tmp.0, %bb8 ] +; CHECK-NEXT: ret float %tmp.0.merge +; +; CHECK: polly.start: +; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops + +; CHECK: polly.merge: +; CHECK-NEXT: %tmp.0.final_reload = load float, float* %tmp.0.s2a +; CHECK-NEXT: br label %polly.merge_new_and_old + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R1:[0-9]*]] = load 
float, float* %tmp.0.phiops
+; CHECK: store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a

+; CHECK: polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT: %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops
+; CHECK: store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a

+; CHECK: polly.stmt.bb4: ; preds = %polly.then3
+; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float, float* %scevgep, align 4, !alias.scope !0, !noalias !2
+; CHECK: %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a
+; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_
+; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define float @f(float* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %tmp6 = load float, float* %tmp5, align 4
+  %tmp7 = fadd float %tmp.0, %tmp6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  br label %exit
+
+exit:
+  ret float %tmp.0
+}
Index: test/Isl/CodeGen/phi_scalar_simple_1.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/phi_scalar_simple_1.ll
@@ -0,0 +1,93 @@
+; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen-isl < %s | FileCheck %s
+;
+; int jd(int *restrict A, int x, int N) {
+;   for (int i = 1; i < N; i++)
+;     for (int j = 3; j < N; j++)
+;       x += A[i];
+;   return x;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @jd(i32* noalias %A, 
i32 %x, i32 %N) { +entry: +; CHECK-LABEL: entry: +; CHECK-DAG: %x.addr.1.lcssa.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.lcssa.phiops = alloca i32 +; CHECK-DAG: %x.addr.1.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.phiops = alloca i32 +; CHECK-DAG: %x.addr.0.s2a = alloca i32 +; CHECK-DAG: %x.addr.0.phiops = alloca i32 + %tmp = sext i32 %N to i64 + br label %for.cond + +; CHECK-LABEL: polly.merge_new_and_old: +; CHECK: %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ] +; CHECK: ret i32 %x.addr.0.merge + +; CHECK-LABEL: polly.start: +; CHECK-NEXT: store i32 %x, i32* %x.addr.0.phiops + +; CHECK-LABEL: polly.merge: +; CHECK: %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a + +for.cond: ; preds = %for.inc4, %entry +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc4 ], [ 1, %entry ] + %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1.lcssa, %for.inc4 ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end6 + +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + +for.body: ; preds = %for.cond +; CHECK-LABEL: polly.stmt.for.body: +; CHECK: %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a +; CHECK: store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops + br label %for.cond1 + +for.end: ; preds = %for.cond1 +; CHECK-LABEL: polly.stmt.for.end: +; CHECK-NEXT: %x.addr.1.lcssa.phiops.reload = load i32, i32* %x.addr.1.lcssa.phiops +; CHECK-NEXT: store i32 %x.addr.1.lcssa.phiops.reload, i32* %x.addr.1.lcssa.s2a[[R4:[0-9]*]] + %x.addr.1.lcssa = phi i32 [ %x.addr.1, %for.cond1 ] + br label %for.inc4 + +for.inc4: ; preds = %for.end +; CHECK-LABEL: 
polly.stmt.for.inc4: +; CHECK: %x.addr.1.lcssa.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.lcssa.s2a[[R4]] +; CHECK: store i32 %x.addr.1.lcssa.s2a.reload[[R5]], i32* %x.addr.0.phiops + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.cond1: ; preds = %for.inc, %for.body +; CHECK-LABEL: polly.stmt.for.cond1: +; CHECK: %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a[[R6:[0-9]*]] +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.lcssa.phiops + %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %add, %for.inc ] + %j.0 = phi i32 [ 3, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + br label %for.inc + +for.inc: ; preds = %for.body3 +; CHECK-LABEL: polly.stmt.for.inc: +; CHECK: %x.addr.1.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: %p_add = add nsw i32 %x.addr.1.s2a.reload[[R3]], %tmp1_p_scalar_ +; CHECK: store i32 %p_add, i32* %x.addr.1.phiops + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %x.addr.1, %tmp1 + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end6: ; preds = %for.cond + ret i32 %x.addr.0 +} Index: test/Isl/CodeGen/phi_scalar_simple_2.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/phi_scalar_simple_2.ll @@ -0,0 +1,108 @@ +; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen-isl < %s | FileCheck %s +; +; int jd(int *restrict A, int x, int N, int c) { +; for (int i = 0; i < N; i++) +; for (int j = 0; j < N; j++) +; if (i < c) +; x += A[i]; +; return x; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @jd(i32* noalias %A, i32 %x, i32 %N, i32 
%c) { +entry: +; CHECK-LABEL: entry: +; CHECK-DAG: %x.addr.2.s2a = alloca i32 +; CHECK-DAG: %x.addr.2.phiops = alloca i32 +; CHECK-DAG: %x.addr.1.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.phiops = alloca i32 +; CHECK-DAG: %x.addr.0.s2a = alloca i32 +; CHECK-DAG: %x.addr.0.phiops = alloca i32 + %tmp = sext i32 %N to i64 + %tmp1 = sext i32 %c to i64 + br label %for.cond + +; CHECK-LABEL: polly.merge_new_and_old: +; CHECK: %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ] +; CHECK: ret i32 %x.addr.0.merge + +; CHECK-LABEL: polly.start: +; CHECK-NEXT: store i32 %x, i32* %x.addr.0.phiops + +; CHECK-LABEL: polly.merge: +; CHECK: %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a + +for.cond: ; preds = %for.inc5, %entry +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc5 ], [ 0, %entry ] + %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1, %for.inc5 ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end7 + +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + +for.body: ; preds = %for.cond +; CHECK-LABEL: polly.stmt.for.body: +; CHECK: %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a +; CHECK: store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops + br label %for.cond1 + +for.inc5: ; preds = %for.end +; CHECK-LABEL: polly.stmt.for.inc5: +; CHECK: %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: store i32 %x.addr.1.s2a.reload[[R5]], i32* %x.addr.0.phiops + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.cond1: ; preds = %for.inc, %for.body +; CHECK-LABEL: polly.stmt.for.cond1: +; 
CHECK: %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a + %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %x.addr.2, %for.inc ] + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 +; CHECK-LABEL: polly.stmt.for.body3: +; CHECK: %x.addr.1.s2a.reload = load i32, i32* %x.addr.1.s2a +; CHECK: store i32 %x.addr.1.s2a.reload, i32* %x.addr.2.phiops + %cmp4 = icmp slt i64 %indvars.iv, %tmp1 + br i1 %cmp4, label %if.then, label %if.end + +if.end: ; preds = %if.then, %for.body3 +; CHECK-LABEL: polly.stmt.if.end: +; CHECK: %x.addr.2.phiops.reload = load i32, i32* %x.addr.2.phiops +; CHECK: store i32 %x.addr.2.phiops.reload, i32* %x.addr.2.s2a + %x.addr.2 = phi i32 [ %add, %if.then ], [ %x.addr.1, %for.body3 ] + br label %for.inc + +for.inc: ; preds = %if.end +; CHECK-LABEL: polly.stmt.for.inc: +; CHECK: %x.addr.2.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.2.s2a +; CHECK: store i32 %x.addr.2.s2a.reload[[R3]], i32* %x.addr.1.phiops + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +if.then: ; preds = %for.body3 +; CHECK-LABEL: polly.stmt.if.then: +; CHECK: %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: %p_add = add nsw i32 %x.addr.1.s2a.reload[[R5]], %tmp2_p_scalar_ +; CHECK: store i32 %p_add, i32* %x.addr.2.phiops + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp2 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %x.addr.1, %tmp2 + br label %if.end + +for.end: ; preds = %for.cond1 + br label %for.inc5 + +for.end7: ; preds = %for.cond + ret i32 %x.addr.0 +} +