Index: include/polly/CodeGen/BlockGenerators.h
===================================================================
--- include/polly/CodeGen/BlockGenerators.h
+++ include/polly/CodeGen/BlockGenerators.h
@@ -92,6 +92,15 @@
   /// @param LTS A map from old loops to new induction variables as SCEVs.
   void copyBB(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT &LTS);
 
+  /// @brief Finalize the code generation for the SCoP @p S.
+  ///
+  /// This will initialize and finalize the scalar variables we demoted during
+  /// the code generation.
+  ///
+  /// @see createScalarInitialization(Region &)
+  /// @see createScalarFinalization(Region &)
+  void finalizeSCoP(Scop &S);
+
 protected:
   PollyIRBuilder &Builder;
   Pass *P;
@@ -99,6 +108,94 @@
   ScalarEvolution &SE;
   IslExprBuilder *ExprBuilder;
 
+  /// @brief The entry block of the current function.
+  BasicBlock &EntryBB;
+
+  /// @brief Map type to resolve scalar dependences on the fly.
+  using ScalarAllocaMapTy = DenseMap<Instruction *, AllocaInst *>;
+
+  /// @brief Maps to resolve scalar dependences for phi operands and scalars.
+  ///
+  ///{
+  ScalarAllocaMapTy PHIOpMap;
+  ScalarAllocaMapTy ScalarMap;
+  ///}
+
+  /// @brief Simple vector of instructions to store escape users.
+  using EscapeUserVectorTy = SmallVector<Instruction *, 4>;
+
+  /// @brief Map from instructions to their escape users as well as the alloca.
+  DenseMap<Instruction *, std::pair<AllocaInst *, EscapeUserVectorTy>>
+      EscapeMap;
+
+  /// @brief Return the alloca for @p ScalarBase in @p Map.
+  ///
+  /// If no alloca was mapped to @p ScalarBase in @p Map, a new one is created
+  /// and named after @p ScalarBase with the suffix @p NameExt.
+  ///
+  /// @param ScalarBase The demoted scalar instruction.
+  /// @param Map The map in which we look for a mapped alloca instruction.
+  /// @param NameExt The suffix we add to the name of a newly created alloca.
+  /// @param IsNew If set, it will hold true iff a new alloca was created.
+  ///
+  /// @returns The alloca for @p ScalarBase in @p Map.
+  AllocaInst *getOrCreateAlloca(Instruction *ScalarBase, ScalarAllocaMapTy &Map,
+                                const char *NameExt = ".s2a",
+                                bool *IsNew = nullptr);
+
+  /// @brief Generate the load, store and mapping for a demoted phi node.
+  ///
+  /// This function will be called for each demoted phi node when we copy the
+  /// block the phi was in. The phi was formerly modeled there as a read and
+  /// a write access. This function can be called for either of those but not
+  /// both. It will load the demoted phi operand and store it in the alloca
+  /// designated to this phi. It will also map the phi to its current value in
+  /// this block.
+  ///
+  /// @param ScalarBasePHI The phi node we demoted.
+  /// @param BBMap The mapping of old to new values for this block.
+  void generateScalarPHI(PHINode *ScalarBasePHI, ValueMapT &BBMap);
+
+  /// @brief Generate reloads of scalars demoted to memory and needed by
+  ///        @p Inst.
+  ///
+  /// @param Stmt The statement we generate code for.
+  /// @param Inst The instruction that might need reloaded values.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst,
+                           ValueMapT &BBMap);
+
+  /// @brief Generate the scalar stores for the given statement.
+  ///
+  /// After the statement @p Stmt was copied, all inner-SCoP scalar dependences
+  /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to
+  /// be demoted to memory.
+  ///
+  /// @param Stmt The statement we generate code for.
+  /// @param BBMap A mapping from old values to their new values in this block.
+  void generateScalarStores(ScopStmt &Stmt, ValueMapT &BBMap);
+
+  /// @brief Handle users of @p Inst outside the SCoP.
+  ///
+  /// @param R The current SCoP region.
+  /// @param Inst The current instruction we check.
+  /// @param InstCopy The copy of the instruction @p Inst in the optimized SCoP.
+  void handleOutsideUsers(const Region &R, Instruction *Inst, Value *InstCopy);
+
+  /// @brief Initialize the memory of demoted scalars.
+  ///
+  /// If a PHI node was demoted and one of its predecessor blocks was outside
+  /// the SCoP, we need to initialize the memory cell we demoted the PHI into
+  /// with the value corresponding to that predecessor. As a SCoP is a
+  /// __single__ entry region, there is at most one such predecessor.
+  void createScalarInitialization(Region &R);
+
+  /// @brief Promote the values of demoted scalars after the SCoP.
+  ///
+  /// If a scalar value was used outside the SCoP, we need to promote the value
+  /// stored in the memory cell allocated for that scalar and combine it with
+  /// the original value in the non-optimized SCoP.
+  void createScalarFinalization(Region &R);
+
   /// @brief Get the new version of a value.
   ///
   /// Given an old value, we first check if a new version of this value is
@@ -129,8 +226,9 @@
   Value *getNewValue(ScopStmt &Stmt, const Value *Old, ValueMapT &BBMap,
                      ValueMapT &GlobalMap, LoopToScevMapT &LTS, Loop *L) const;
 
-  void copyInstScalar(ScopStmt &Stmt, const Instruction *Inst, ValueMapT &BBMap,
-                      ValueMapT &GlobalMap, LoopToScevMapT &LTS);
+  Instruction *copyInstScalar(ScopStmt &Stmt, const Instruction *Inst,
+                              ValueMapT &BBMap, ValueMapT &GlobalMap,
+                              LoopToScevMapT &LTS);
 
   /// @brief Get the innermost loop that surrounds an instruction.
   ///
@@ -146,13 +244,13 @@
                                  const Value *Pointer, ValueMapT &BBMap,
                                  ValueMapT &GlobalMap, LoopToScevMapT &LTS);
 
-  Value *generateScalarLoad(ScopStmt &Stmt, const LoadInst *load,
-                            ValueMapT &BBMap, ValueMapT &GlobalMap,
-                            LoopToScevMapT &LTS);
+  Instruction *generateScalarLoad(ScopStmt &Stmt, const LoadInst *load,
+                                  ValueMapT &BBMap, ValueMapT &GlobalMap,
+                                  LoopToScevMapT &LTS);
 
-  Value *generateScalarStore(ScopStmt &Stmt, const StoreInst *store,
-                             ValueMapT &BBMap, ValueMapT &GlobalMap,
-                             LoopToScevMapT &LTS);
+  Instruction *generateScalarStore(ScopStmt &Stmt, const StoreInst *store,
+                                   ValueMapT &BBMap, ValueMapT &GlobalMap,
+                                   LoopToScevMapT &LTS);
 
   /// @brief Copy a single Instruction.
   ///
@@ -166,13 +264,11 @@
   /// @param GlobalMap A mapping from old values to their new values
   ///                  (for values recalculated in the new ScoP, but not
   ///                   within this basic block).
-  /// @param LTS       A mapping from loops virtual canonical induction
-  ///                  variable to their new values
-  ///                  (for values recalculated in the new ScoP, but not
-  ///                   within this basic block).
-  void copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
-                       ValueMapT &BBMap, ValueMapT &GlobalMap,
-                       LoopToScevMapT &LTS);
+  ///
+  /// @returns The copied instruction or nullptr if none exists.
+  Value *copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
+                         ValueMapT &BBMap, ValueMapT &GlobalMap,
+                         LoopToScevMapT &LTS);
 };
 
 /// @brief Generate a new vector basic block for a polyhedral statement.
Index: include/polly/ScopInfo.h
===================================================================
--- include/polly/ScopInfo.h
+++ include/polly/ScopInfo.h
@@ -207,6 +207,12 @@
   /// Updated access relation read from JSCOP file.
   isl_map *newAccessRelation;
 
+  /// @brief Other memory effects which are linked to this one.
+  ///
+  /// If an instruction can cause multiple memory accesses or effects, they
+  /// are linked together (in linked-list fashion).
+  MemoryAccess *NextMA;
+
   void assumeNoOutOfBound(const IRAccess &Access);
 
   /// @brief Get the original access function as read from IR.
@@ -333,6 +339,12 @@
   /// @brief Align the parameters in the access relation to the scop context
   void realignParams();
 
+  /// @brief Set the next MemoryAccess (linked list fashion).
+  void setNextMA(MemoryAccess *MA) { NextMA = MA; }
+
+  /// @brief Get the next MemoryAccess (linked list fashion).
+  MemoryAccess *getNextMA() const { return NextMA; }
+
   /// @brief Print the MemoryAccess.
   ///
   /// @param OS The output stream the MemoryAccess is printed to.
Index: lib/Analysis/ScopInfo.cpp
===================================================================
--- lib/Analysis/ScopInfo.cpp
+++ lib/Analysis/ScopInfo.cpp
@@ -486,7 +486,7 @@
 MemoryAccess::MemoryAccess(const IRAccess &Access, Instruction *AccInst,
                            ScopStmt *Statement, const ScopArrayInfo *SAI)
     : AccType(getMemoryAccessType(Access)), Statement(Statement), Inst(AccInst),
-      newAccessRelation(nullptr) {
+      newAccessRelation(nullptr), NextMA(nullptr) {
 
   isl_ctx *Ctx = Statement->getIslCtx();
   BaseAddr = Access.getBase();
@@ -728,16 +728,10 @@
 
     MemAccs.push_back(new MemoryAccess(Access, AccessInst, this, SAI));
 
-    // We do not track locations for scalar memory accesses at the moment.
-    //
-    // We do not have a use for this information at the moment. If we need this
-    // at some point, the "instruction -> access" mapping needs to be enhanced
-    // as a single instruction could then possibly perform multiple accesses.
-    if (!Access.isScalar()) {
-      assert(!InstructionToAccess.count(AccessInst) &&
-             "Unexpected 1-to-N mapping on instruction to access map!");
-      InstructionToAccess[AccessInst] = MemAccs.back();
-    }
+    MemoryAccess *&MA = InstructionToAccess[AccessInst];
+    if (MA)
+      MemAccs.back()->setNextMA(MA);
+    MA = MemAccs.back();
   }
 }
Index: lib/CodeGen/BlockGenerators.cpp
===================================================================
--- lib/CodeGen/BlockGenerators.cpp
+++ lib/CodeGen/BlockGenerators.cpp
@@ -78,7 +78,8 @@
 BlockGenerator::BlockGenerator(PollyIRBuilder &B, Pass *P, LoopInfo &LI,
                                ScalarEvolution &SE, IslExprBuilder *ExprBuilder)
-    : Builder(B), P(P), LI(LI), SE(SE), ExprBuilder(ExprBuilder) {}
+    : Builder(B), P(P), LI(LI), SE(SE), ExprBuilder(ExprBuilder),
+      EntryBB(B.GetInsertBlock()->getParent()->getEntryBlock()) {}
 
 Value *BlockGenerator::getNewValue(ScopStmt &Stmt, const Value *Old,
                                    ValueMapT &BBMap, ValueMapT &GlobalMap,
@@ -110,7 +111,6 @@
     SCEVExpander Expander(SE, "polly");
     Value *Expanded = Expander.expandCodeFor(NewScev, Old->getType(),
                                              Builder.GetInsertPoint());
-    BBMap[Old] = Expanded;
     return Expanded;
   }
@@ -130,14 +130,16 @@
   return nullptr;
 }
 
-void BlockGenerator::copyInstScalar(ScopStmt &Stmt, const Instruction *Inst,
-                                    ValueMapT &BBMap, ValueMapT &GlobalMap,
-                                    LoopToScevMapT &LTS) {
+Instruction *BlockGenerator::copyInstScalar(ScopStmt &Stmt,
+                                            const Instruction *Inst,
+                                            ValueMapT &BBMap,
+                                            ValueMapT &GlobalMap,
+                                            LoopToScevMapT &LTS) {
   // We do not generate debug intrinsics as we did not investigate how to
   // copy them correctly. At the current state, they just crash the code
   // generation as the meta-data operands are not correctly copied.
   if (isa<DbgInfoIntrinsic>(Inst))
-    return;
+    return nullptr;
 
   Instruction *NewInst = Inst->clone();
@@ -150,7 +152,7 @@
       assert(!isa<StoreInst>(NewInst) &&
              "Store instructions are always needed!");
       delete NewInst;
-      return;
+      return nullptr;
     }
 
     NewInst->replaceUsesOfWith(OldOperand, NewOperand);
@@ -161,6 +163,8 @@
 
   if (!NewInst->getType()->isVoidTy())
     NewInst->setName("p_" + Inst->getName());
+
+  return NewInst;
 }
 
 Value *BlockGenerator::getNewAccessOperand(ScopStmt &Stmt,
@@ -201,61 +205,71 @@
   return LI.getLoopFor(Inst->getParent());
 }
 
-Value *BlockGenerator::generateScalarLoad(ScopStmt &Stmt, const LoadInst *Load,
-                                          ValueMapT &BBMap,
-                                          ValueMapT &GlobalMap,
-                                          LoopToScevMapT &LTS) {
+Instruction *BlockGenerator::generateScalarLoad(ScopStmt &Stmt,
+                                                const LoadInst *Load,
+                                                ValueMapT &BBMap,
+                                                ValueMapT &GlobalMap,
+                                                LoopToScevMapT &LTS) {
   const Value *Pointer = Load->getPointerOperand();
   Value *NewPointer =
       generateLocationAccessed(Stmt, Load, Pointer, BBMap, GlobalMap, LTS);
-  Value *ScalarLoad = Builder.CreateAlignedLoad(
+  Instruction *ScalarLoad = Builder.CreateAlignedLoad(
       NewPointer, Load->getAlignment(), Load->getName() + "_p_scalar_");
   return ScalarLoad;
 }
 
-Value *BlockGenerator::generateScalarStore(ScopStmt &Stmt,
-                                           const StoreInst *Store,
-                                           ValueMapT &BBMap,
-                                           ValueMapT &GlobalMap,
-                                           LoopToScevMapT &LTS) {
+Instruction *BlockGenerator::generateScalarStore(ScopStmt &Stmt,
+                                                 const StoreInst *Store,
+                                                 ValueMapT &BBMap,
+                                                 ValueMapT &GlobalMap,
+                                                 LoopToScevMapT &LTS) {
   const Value *Pointer = Store->getPointerOperand();
   Value *NewPointer =
      generateLocationAccessed(Stmt, Store, Pointer, BBMap, GlobalMap, LTS);
   Value *ValueOperand = getNewValue(Stmt, Store->getValueOperand(), BBMap,
                                     GlobalMap, LTS, getLoopForInst(Store));
 
-  Value *NewStore = Builder.CreateAlignedStore(ValueOperand, NewPointer,
-                                               Store->getAlignment());
+  Instruction *NewStore = Builder.CreateAlignedStore(ValueOperand, NewPointer,
                                                      Store->getAlignment());
   return NewStore;
 }
 
-void BlockGenerator::copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
-                                     ValueMapT &BBMap, ValueMapT &GlobalMap,
-                                     LoopToScevMapT &LTS) {
+Value *BlockGenerator::copyInstruction(ScopStmt &Stmt, const Instruction *Inst,
+                                       ValueMapT &BBMap, ValueMapT &GlobalMap,
+                                       LoopToScevMapT &LTS) {
+
+  // First check for possible scalar dependences for this instruction.
+  generateScalarLoads(Stmt, Inst, BBMap);
+
   // Terminator instructions control the control flow. They are explicitly
   // expressed in the clast and do not need to be copied.
   if (Inst->isTerminator())
-    return;
+    return nullptr;
 
   if (canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion()))
-    return;
+    return getNewValue(Stmt, Inst, BBMap, GlobalMap, LTS, getLoopForInst(Inst));
 
   if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
-    Value *NewLoad = generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS);
+    Instruction *NewLoad =
+        generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS);
     // Compute NewLoad before its insertion in BBMap to make the insertion
     // deterministic.
     BBMap[Load] = NewLoad;
-    return;
+    return NewLoad;
   }
 
   if (const StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
-    Value *NewStore = generateScalarStore(Stmt, Store, BBMap, GlobalMap, LTS);
+    Instruction *NewStore =
+        generateScalarStore(Stmt, Store, BBMap, GlobalMap, LTS);
    // Compute NewStore before its insertion in BBMap to make the insertion
     // deterministic.
     BBMap[Store] = NewStore;
-    return;
+    return NewStore;
   }
 
+  if (isa<PHINode>(Inst))
+    return nullptr;
+
   // Skip some special intrinsics for which we do not adjust the semantics to
   // the new schedule. All others are handled like every other instruction.
   if (auto *IT = dyn_cast<IntrinsicInst>(Inst)) {
@@ -273,14 +287,14 @@
     case llvm::Intrinsic::donothing:
     case llvm::Intrinsic::assume:
     case llvm::Intrinsic::expect:
-      return;
+      return nullptr;
     default:
       // Other intrinsics are copied.
       break;
     }
   }
 
-  copyInstScalar(Stmt, Inst, BBMap, GlobalMap, LTS);
+  return copyInstScalar(Stmt, Inst, BBMap, GlobalMap, LTS);
 }
 
 void BlockGenerator::copyBB(ScopStmt &Stmt, ValueMapT &GlobalMap,
@@ -295,9 +309,302 @@
   Builder.SetInsertPoint(CopyBB->begin());
   ValueMapT BBMap;
 
+  const Region &R = Stmt.getParent()->getRegion();
+
+  for (Instruction &Inst : *BB) {
+    Value *InstCopy = copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS);
+
+    if (InstCopy) {
+      Value *&InstMapped = BBMap[&Inst];
+      assert((InstMapped == nullptr || InstMapped == InstCopy) &&
+             "Bad mapping!");
+      InstMapped = InstCopy;
+    }
+  }
+
+  // After a basic block was copied, store all scalars that escape this block
+  // in their alloca. First the scalars that have dependences inside the SCoP,
+  // then the ones that might escape the SCoP.
+  generateScalarStores(Stmt, BBMap);
   for (Instruction &Inst : *BB)
-    copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS);
+    handleOutsideUsers(R, &Inst, BBMap[&Inst]);
 }
 
+AllocaInst *BlockGenerator::getOrCreateAlloca(Instruction *ScalarBase,
+                                              ScalarAllocaMapTy &Map,
+                                              const char *NameExt,
+                                              bool *IsNew) {
+
+  // Check if an alloca was cached for the base instruction.
+  AllocaInst *&Addr = Map[ScalarBase];
+
+  // If requested, indicate whether the alloca was found or newly created.
+  if (IsNew)
+    *IsNew = (Addr == nullptr);
+
+  // If no alloca was found, create one and insert it in the entry block.
+  if (!Addr) {
+    auto *Ty = ScalarBase->getType();
+    Addr = new AllocaInst(Ty, ScalarBase->getName() + NameExt);
+    Addr->insertBefore(EntryBB.getFirstInsertionPt());
+  }
+
+  return Addr;
+}
+
+void BlockGenerator::handleOutsideUsers(const Region &R, Instruction *Inst,
+                                        Value *InstCopy) {
+  AllocaInst *ScalarAddr = nullptr;
+
+  // Check if there exists an escape alloca for this instruction.
+  auto EscapeMapEntry = EscapeMap.find(Inst);
+  if (EscapeMapEntry != EscapeMap.end())
+    ScalarAddr = EscapeMapEntry->getSecond().first;
+
+  // If not, check whether there are escaping uses and, if so, create the
+  // escape alloca.
+  if (!ScalarAddr) {
+    EscapeUserVectorTy EscapeUsers;
+    for (User *U : Inst->users()) {
+
+      // Non-instruction users will never escape.
+      Instruction *UI = dyn_cast<Instruction>(U);
+      if (!UI)
+        continue;
+
+      if (R.contains(UI))
+        continue;
+
+      EscapeUsers.push_back(UI);
+    }
+
+    // Exit if no escape uses were found.
+    if (EscapeUsers.empty())
+      return;
+
+    // Get or create an escape alloca for this instruction.
+    bool IsNew;
+    ScalarAddr = getOrCreateAlloca(Inst, ScalarMap, ".escape", &IsNew);
+
+    // Remember that this instruction has escape uses and the escape alloca.
+    EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers));
+
+    // If the escape alloca was just created, store the instruction in there;
+    // otherwise that happened already.
+    if (IsNew) {
+      assert(InstCopy && "Except PHIs every instruction should have a copy!");
+      Builder.CreateStore(InstCopy, ScalarAddr);
+    }
+  }
+}
+
+void BlockGenerator::generateScalarPHI(PHINode *ScalarBasePHI,
+                                       ValueMapT &BBMap) {
+
+  AllocaInst *PHIOpAddr = getOrCreateAlloca(ScalarBasePHI, PHIOpMap, ".phiops");
+  LoadInst *LI =
+      Builder.CreateLoad(PHIOpAddr, ScalarBasePHI->getName() + ".reload");
+
+  AllocaInst *PHIAddr = getOrCreateAlloca(ScalarBasePHI, ScalarMap);
+  Builder.CreateStore(LI, PHIAddr);
+
+  BBMap[ScalarBasePHI] = LI;
+}
+
+void BlockGenerator::generateScalarLoads(ScopStmt &Stmt,
+                                         const Instruction *Inst,
+                                         ValueMapT &BBMap) {
+
+  // Iterate over all memory accesses for the given instruction and handle all
+  // scalar reads.
+  if (MemoryAccess *MA = Stmt.lookupAccessFor(Inst)) {
+    do {
+      if (!MA->isScalar() || !MA->isRead())
+        continue;
+
+      Instruction *ScalarBase = cast<Instruction>(MA->getBaseAddr());
+      Instruction *ScalarInst = MA->getAccessInstruction();
+
+      PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+
+      if (ScalarBasePHI == ScalarInst) {
+        // Only phi nodes read themselves and these reads are treated
+        // differently since we need to read the incoming operand and move it
+        // to the phi alloca.
+        generateScalarPHI(ScalarBasePHI, BBMap);
+      } else {
+        // For non-phi loads we look up the alloca in the ScalarMap, reload it
+        // and add the mapping to the ones in the current basic block.
+        AllocaInst *ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap);
+        LoadInst *LI =
+            Builder.CreateLoad(ScalarAddr, ScalarBase->getName() + ".reload");
+        BBMap[ScalarBase] = LI;
+      }
+
+    } while ((MA = MA->getNextMA()));
+  }
+}
+
+void BlockGenerator::generateScalarStores(ScopStmt &Stmt, ValueMapT &BBMap) {
+  const Region &R = Stmt.getParent()->getRegion();
+  BasicBlock *BB = Stmt.getBasicBlock();
+
+  // Iterate over all accesses in the given statement.
+  for (auto MI = Stmt.begin(), ME = Stmt.end(); MI != ME; MI++) {
+    MemoryAccess *MA = *MI;
+
+    // Skip non-scalar and read accesses.
+    if (!MA->isScalar() || MA->isRead())
+      continue;
+
+    Instruction *ScalarBase = cast<Instruction>(MA->getBaseAddr());
+    Instruction *ScalarInst = MA->getAccessInstruction();
+    PHINode *ScalarBasePHI = dyn_cast<PHINode>(ScalarBase);
+    int ScalarBasePHIIdx =
+        ScalarBasePHI ? ScalarBasePHI->getBasicBlockIndex(BB) : -1;
+
+    // Skip the artificial self write access of phi nodes as they are handled
+    // when the read access is treated. Also see generateScalarPHI(...).
+    // However, do not skip real loop carried phis, that is if the block
+    // actually writes the operand of the phi again.
+    if (ScalarBasePHI && ScalarBase == ScalarInst &&
+        ScalarBase->getParent() == BB && ScalarBasePHIIdx < 0)
+      continue;
+
+    // Get the alloca node for the base instruction. If the base is a phi, get
+    // the operand alloca, otherwise the regular scalar alloca.
+    AllocaInst *ScalarAddr =
+        getOrCreateAlloca(ScalarBase, ScalarBasePHI ? PHIOpMap : ScalarMap);
+
+    // Get the new value of the scalar instruction and store it in the
+    // alloca node. For non-instructions we just take the value, for
+    // instructions mapped in the current block we use the new version of the
+    // value, for all others we have to reload them first.
+    Value *ScalarValue = ScalarInst;
+
+    // In a first step recognize phi operand stores and get the right value
+    // from the phi operand list.
+    // TODO: This is necessary since we cannot store values in MemoryAccesses.
+    if (ScalarBasePHI) {
+
+      // If the phi was split out of the region prior to code generation (@see
+      // executeScopConditionally) we use the phi itself as value (it dominates
+      // the region now).
+      // TODO: This is an artifact we want to avoid by splitting the entry edge
+      //       in the beginning.
+      if (!R.contains(ScalarBase->getParent())) {
+        Builder.CreateStore(ScalarBase, ScalarAddr);
+        continue;
+      }
+
+      assert(ScalarBasePHIIdx >= 0 && "Bad scalar phi operand write access.");
+      ScalarValue = ScalarBasePHI->getIncomingValue(ScalarBasePHIIdx);
+    }
+
+    // Now check if:
+    //   - The ScalarValue was a non-instruction operand of a phi or not part
+    //     of the SCoP:
+    //       - If so, store the ScalarValue in the ScalarAddr.
+    //       - Otherwise, check the next point.
+    //   - The ScalarValue instruction was mapped in the current basic block:
+    //       - If so, store the new version of the ScalarValue.
+    //       - Otherwise, reload the ScalarValue and store the reloaded value.
+    Instruction *ScalarValueInst = dyn_cast<Instruction>(ScalarValue);
+    if (ScalarValueInst && R.contains(ScalarValueInst)) {
+      auto BBMapIt = BBMap.find(ScalarValue);
+      if (BBMapIt != BBMap.end()) {
+        ScalarValue = BBMapIt->getSecond();
+      } else {
+        assert(ScalarMap.count(ScalarValueInst) &&
+               "ScalarInst not mapped in the block and not in the scalar map!");
+        ScalarValue =
+            Builder.CreateLoad(ScalarMap[ScalarValueInst], ".phiop_reload");
+      }
+    }
+
+    assert(ScalarValue &&
+           "Scalar instruction should have been remapped in this basic block!");
+    Builder.CreateStore(ScalarValue, ScalarAddr);
+  }
+}
+
+void BlockGenerator::createScalarInitialization(Region &R) {
+  // The split block __just before__ the region and optimized region.
+  BasicBlock *SplitBB = R.getEnteringBlock();
+  BranchInst *SplitBBTerm = cast<BranchInst>(SplitBB->getTerminator());
+  assert(SplitBBTerm->getNumSuccessors() == 2 && "Bad region entering block!");
+
+  // Get the start block of the __optimized__ region.
+  BasicBlock *StartBB = SplitBBTerm->getSuccessor(0);
+  if (StartBB == R.getEntry())
+    StartBB = SplitBBTerm->getSuccessor(1);
+
+  // For each phi predecessor outside the region, store the incoming operand
+  // value prior to entering the optimized region.
+  Builder.SetInsertPoint(StartBB->getTerminator());
+
+  for (const auto &PHIOpMapping : PHIOpMap) {
+    const PHINode *PHI = cast<PHINode>(PHIOpMapping.getFirst());
+
+    // Check if this PHI has the split block as predecessor (that is the only
+    // possible predecessor outside the SCoP).
+    int idx = PHI->getBasicBlockIndex(SplitBB);
+    if (idx < 0)
+      continue;
+
+    // If the split block is the predecessor, initialize the phi operand
+    // alloca.
+    Builder.CreateStore(PHI->getIncomingValue(idx), PHIOpMapping.getSecond());
+  }
+}
+
+void BlockGenerator::createScalarFinalization(Region &R) {
+  // The exit block of the __unoptimized__ region.
+  BasicBlock *ExitBB = R.getExitingBlock();
+  // The merge block __just after__ the region and the optimized region.
+  BasicBlock *MergeBB = R.getExit();
+
+  // The exit block of the __optimized__ region.
+  BasicBlock *OptExitBB = *(pred_begin(MergeBB));
+  if (OptExitBB == ExitBB)
+    OptExitBB = *(++pred_begin(MergeBB));
+
+  Builder.SetInsertPoint(OptExitBB->getTerminator());
+  for (const auto &EscapeMapping : EscapeMap) {
+    // Extract the escaping instruction and the escaping users as well as the
+    // alloca the instruction was demoted to.
+    Instruction *EscapeInst = EscapeMapping.getFirst();
+    const auto &EscapeMappingValue = EscapeMapping.getSecond();
+    const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second;
+    AllocaInst *ScalarAddr = EscapeMappingValue.first;
+
+    // Reload the demoted instruction in the optimized version of the SCoP.
+    Instruction *EscapeInstReload =
+        Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload");
+
+    // Create the merge phi that merges the optimized and unoptimized version.
+    PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2,
+                                        EscapeInst->getName() + ".merge");
+    MergePHI->insertBefore(MergeBB->getFirstInsertionPt());
+
+    // Add the respective values to the merge phi.
+    MergePHI->addIncoming(EscapeInstReload, OptExitBB);
+    MergePHI->addIncoming(EscapeInst, ExitBB);
+
+    // The information scalar evolution has about the escaping instruction
+    // needs to be revoked so the new merge phi will be used.
+    if (SE.isSCEVable(EscapeInst->getType()))
+      SE.forgetValue(EscapeInst);
+
+    // Replace all uses of the demoted instruction with the merge phi.
+    for (Instruction *EUser : EscapeUsers)
+      EUser->replaceUsesOfWith(EscapeInst, MergePHI);
+  }
+}
+
+void BlockGenerator::finalizeSCoP(Scop &S) {
+  createScalarInitialization(S.getRegion());
+  createScalarFinalization(S.getRegion());
+}
 
 VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen,
Index: lib/CodeGen/IslCodeGeneration.cpp
===================================================================
--- lib/CodeGen/IslCodeGeneration.cpp
+++ lib/CodeGen/IslCodeGeneration.cpp
@@ -72,6 +72,12 @@
   void addParameters(__isl_take isl_set *Context);
 
   void create(__isl_take isl_ast_node *Node);
+
+  /// @brief Finalize code generation for the SCoP @p S.
+  ///
+  /// @see BlockGenerator::finalizeSCoP(Scop &S)
+  void finalizeSCoP(Scop &S) { BlockGen.finalizeSCoP(S); }
+
   IslExprBuilder &getExprBuilder() { return ExprBuilder; }
 
 private:
@@ -83,7 +89,10 @@
   SCEVExpander *Rewriter;
   IslExprBuilder ExprBuilder;
+
+  /// @brief The generator used to copy a basic block.
   BlockGenerator BlockGen;
+
   Pass *P;
   const DataLayout &DL;
   LoopInfo &LI;
@@ -938,6 +947,8 @@
   Builder.SetInsertPoint(StartBlock->begin());
   NodeBuilder.create(AI->getAst());
+
+  NodeBuilder.finalizeSCoP(S);
 
   return true;
 }
Index: test/Isl/CodeGen/phi_loop_carried_float.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/phi_loop_carried_float.ll
@@ -0,0 +1,60 @@
+; RUN: opt %loadPolly -S -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s
+;
+;    void f(float *A, int N) {
+;      float tmp = 0;
+;      for (int i = 0; i < N; i++)
+;        tmp += A[i];
+;    }
+;
+; CHECK: polly.merge_new_and_old:
+; CHECK-NOT: %tmp.0.merge
+;
+; CHECK: polly.start:
+; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops
+
+; CHECK: polly.merge:
+; CHECK-NEXT: br label %polly.merge_new_and_old
+
+; CHECK: polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT: %tmp.0.reload[[R1:[0-9]*]] = load float* %tmp.0.phiops
+; CHECK-NEXT: store float %tmp.0.reload[[R1]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT: %tmp.0.reload[[R2:[0-9]*]] = load float* %tmp.0.phiops
+; CHECK-NEXT: store float %tmp.0.reload[[R2]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb4: ; preds = %polly.then3
+; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float* %scevgep, align 4, !alias.scope !0, !noalias !2
+; CHECK: %tmp.0.reload[[R3:[0-9]*]] = load float* %tmp.0.s2a
+; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.reload[[R3]], %tmp[[R5]]_p_scalar_
+; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %tmp6 = load float* %tmp5, align 4
+  %tmp7 = fadd float %tmp.0, %tmp6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  br label %exit
+
+exit:
+  ret void
+}
Index: test/Isl/CodeGen/phi_loop_carried_float_escape.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/phi_loop_carried_float_escape.ll
@@ -0,0 +1,62 @@
+; RUN: opt %loadPolly -S -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s
+;
+;    float f(float *A, int N) {
+;      float tmp = 0;
+;      for (int i = 0; i < N; i++)
+;        tmp += A[i];
+;      return tmp;
+;    }
+;
+; CHECK: polly.merge_new_and_old:
+; CHECK-NEXT: %tmp.0.merge = phi float [ %tmp.0.final_reload, %polly.merge ], [ %tmp.0, %bb8 ]
+;
+; CHECK: polly.start:
+; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops
+
+; CHECK: polly.merge:
+; CHECK-NEXT: %tmp.0.final_reload = load float* %tmp.0.s2a
+; CHECK-NEXT: br label %polly.merge_new_and_old
+
+; CHECK: polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT: %tmp.0.reload[[R1:[0-9]*]] = load float* %tmp.0.phiops
+; CHECK-NEXT: store float %tmp.0.reload[[R1]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb1{{[0-9]*}}:
+; CHECK-NEXT: %tmp.0.reload[[R2:[0-9]*]] = load float* %tmp.0.phiops
+; CHECK-NEXT: store float %tmp.0.reload[[R2]], float* %tmp.0.s2a
+
+; CHECK: polly.stmt.bb4: ; preds = %polly.then3
+; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float* %scevgep, align 4, !alias.scope !0, !noalias !2
+; CHECK: %tmp.0.reload[[R3:[0-9]*]] = load float* %tmp.0.s2a
+; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.reload[[R3]], %tmp[[R5]]_p_scalar_
+; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define float @f(float* %A, i32 %N) {
+bb:
+  %tmp = sext i32 %N to i64
+  br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ]
+  %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ]
+  %tmp2 = icmp slt i64 %indvars.iv, %tmp
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %tmp6 = load float* %tmp5, align 4
+  %tmp7 = fadd float %tmp.0, %tmp6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  br label %exit
+
+exit:
+  ret float %tmp.0
+}
Index: test/Isl/CodeGen/phi_scalar_simple_1.ll
===================================================================
--- /dev/null
+++ test/Isl/CodeGen/phi_scalar_simple_1.ll
@@ -0,0 +1,93 @@
+; RUN: opt %loadPolly -S -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen-isl < %s | FileCheck %s
+;
+;    int jd(int *restrict A, int x, int N) {
+;      for (int i = 1; i < N; i++)
+;        for (int j = 3; j < N; j++)
+;          x += A[i];
+;      return x;
+;    }
+;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @jd(i32* noalias %A, i32 %x, i32 %N) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK-DAG: %x.addr.1.lcssa.s2a21 = alloca i32
+; CHECK-DAG: %x.addr.1.lcssa.s2a = alloca i32
+; CHECK-DAG: %x.addr.1.s2a14 = alloca i32
+; CHECK-DAG: %x.addr.1.s2a = alloca i32
+; CHECK-DAG: %x.addr.0.s2a = alloca i32
+; CHECK-DAG: %x.addr.0.phiops = alloca i32
+  %tmp = sext i32 %N to i64
+  br label %for.cond
+
+; CHECK-LABEL: polly.merge_new_and_old:
+; CHECK: %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ]
+; CHECK: ret i32 %x.addr.0.merge
+
+; CHECK-LABEL: polly.start:
+; CHECK-NEXT: store i32 %x, i32* %x.addr.0.phiops
+
+; CHECK-LABEL: polly.merge:
+; CHECK: %x.addr.0.final_reload = load i32* %x.addr.0.s2a
+
+for.cond:                                         ; preds = %for.inc4, %entry
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK: %x.addr.0.reload[[R1:[0-9]*]] = load i32* %x.addr.0.phiops
+; CHECK: store i32 %x.addr.0.reload[[R1]], i32* %x.addr.0.s2a
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc4 ], [ 1, %entry ]
+  %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1.lcssa, %for.inc4 ]
+  %cmp = icmp slt i64 %indvars.iv, %tmp
+  br i1 %cmp, label %for.body, label %for.end6
+
+; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}:
+; CHECK: %x.addr.0.reload[[R1:[0-9]*]] = load i32* %x.addr.0.phiops
+; CHECK: store i32 %x.addr.0.reload[[R1]], i32* %x.addr.0.s2a
+
+for.body:                                         ; preds = %for.cond
+; CHECK-LABEL: polly.stmt.for.body:
+; CHECK: %.phiop_reload[[R2:[0-9]*]] = load i32* %x.addr.0.s2a
+; CHECK: store i32 %.phiop_reload[[R2]], i32* %x.addr.1.s2a
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+; CHECK-LABEL: polly.stmt.for.end:
+; CHECK-NEXT: %x.addr.1.lcssa.reload = load i32* %x.addr.1.lcssa.s2a
+; CHECK-NEXT: store i32 %x.addr.1.lcssa.reload, i32* %x.addr.1.lcssa.s2a[[R4:[0-9]*]]
+  %x.addr.1.lcssa = phi i32 [ %x.addr.1, %for.cond1 ]
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+; CHECK-LABEL: polly.stmt.for.inc4:
+; CHECK: %.phiop_reload[[R5:[0-9]*]] = load i32* %x.addr.1.lcssa.s2a[[R4]]
+; CHECK: store i32 %.phiop_reload[[R5]], i32* %x.addr.0.phiops
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+; CHECK-LABEL: polly.stmt.for.cond1:
+; CHECK: %x.addr.1.reload = load i32* %x.addr.1.s2a
+; CHECK: store i32 %x.addr.1.reload, i32* %x.addr.1.s2a[[R6:[0-9]*]]
+; CHECK: store i32 %x.addr.1.reload, i32* %x.addr.1.lcssa.s2a
+  %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %add, %for.inc ]
+  %j.0 = phi i32 [ 3, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, %N
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+; CHECK-LABEL: polly.stmt.for.inc:
+; CHECK: %x.addr.1.reload[[R3:[0-9]*]] = load i32* %x.addr.1.s2a
+; CHECK: %p_add = add nsw i32 %x.addr.1.reload[[R3]], %tmp1_p_scalar_
+; CHECK: store i32 %p_add, i32* %x.addr.1.s2a
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %tmp1 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %x.addr.1, %tmp1
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end6:                                         ; preds = %for.cond
+  ret i32 %x.addr.0
+}
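
Illustration (editor's sketch, not part of the patch): the demotion scheme the
patch implements can be summarized on the loop-carried phi from the tests
above. The value names (.phiops, .s2a, .final_reload, .merge) and block labels
follow the CHECK lines; %val and %elem are hypothetical placeholders, so this
hand-written IR is only a sketch of the expected output under those
assumptions:

  entry:
    %tmp.0.phiops = alloca float        ; incoming operands of the demoted phi
    %tmp.0.s2a    = alloca float        ; current value of the demoted phi

  polly.start:                          ; createScalarInitialization(...)
    ; operand of the entry edge, written before entering the optimized region
    store float 0.000000e+00, float* %tmp.0.phiops

  polly.stmt.bb1:                       ; generateScalarPHI(...)
    ; read the incoming operand and move it to the phi alloca
    %tmp.0.reload = load float* %tmp.0.phiops
    store float %tmp.0.reload, float* %tmp.0.s2a

  polly.stmt.bb4:                       ; generateScalarLoads/-Stores(...)
    ; reload the phi value, compute, and write the back-edge operand
    %val = load float* %tmp.0.s2a
    %p_tmp = fadd float %val, %elem
    store float %p_tmp, float* %tmp.0.phiops

  polly.merge:                          ; createScalarFinalization(...)
    ; promote the demoted scalar for users outside the SCoP
    %tmp.0.final_reload = load float* %tmp.0.s2a

  polly.merge_new_and_old:
    ; merge the optimized and unoptimized versions; escape users are
    ; rewritten to use this phi
    %tmp.0.merge = phi float [ %tmp.0.final_reload, %polly.merge ], [ %tmp.0, %bb8 ]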