Index: polly/trunk/include/polly/CodeGen/BlockGenerators.h =================================================================== --- polly/trunk/include/polly/CodeGen/BlockGenerators.h +++ polly/trunk/include/polly/CodeGen/BlockGenerators.h @@ -63,6 +63,24 @@ /// @brief Generate a new basic block for a polyhedral statement. class BlockGenerator { public: + /// @brief Map types to resolve scalar dependences. + /// + ///@{ + + /// @see The ScalarMap and PHIOpMap member. + using ScalarAllocaMapTy = DenseMap; + + /// @brief Simple vector of instructions to store escape users. + using EscapeUserVectorTy = SmallVector; + + /// @brief Map type to resolve escaping users for scalar instructions. + /// + /// @see The EscapeMap member. + using EscapeUsersAllocaMapTy = + DenseMap>; + + ///@} + /// @brief Create a generator for basic blocks. /// /// @param Builder The LLVM-IR Builder used to generate the statement. The @@ -71,9 +89,14 @@ /// @param LI The loop info for the current function /// @param SE The scalar evolution info for the current function /// @param DT The dominator tree of this function. + /// @param ScalarMap Map from scalars to their demoted location. + /// @param PHIOpMap Map from PHIs to their demoted operand location. + /// @param EscapeMap Map from scalars to their escape users and locations. /// @param ExprBuilder An expression builder to generate new access functions. BlockGenerator(PollyIRBuilder &Builder, LoopInfo &LI, ScalarEvolution &SE, - DominatorTree &DT, IslExprBuilder *ExprBuilder = nullptr); + DominatorTree &DT, ScalarAllocaMapTy &ScalarMap, + ScalarAllocaMapTy &PHIOpMap, EscapeUsersAllocaMapTy &EscapeMap, + IslExprBuilder *ExprBuilder = nullptr); /// @brief Copy the basic block. /// @@ -87,6 +110,18 @@ /// @param LTS A map from old loops to new induction variables as SCEVs. void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief Finalize the code generation for the SCoP @p S. + /// + /// This will initialize and finalize the scalar variables we demoted during + /// the code generation. + /// + /// @see createScalarInitialization(Region &, ValueMapT &) + /// @see createScalarFinalization(Region &) + void finalizeSCoP(Scop &S, ValueMapT &VMap); + + /// @brief An empty destructor + virtual ~BlockGenerator(){}; + protected: PollyIRBuilder &Builder; LoopInfo &LI; @@ -96,6 +131,42 @@ /// @brief The dominator tree of this function. DominatorTree &DT; + /// @brief The entry block of the current function. + BasicBlock *EntryBB; + + /// @brief Maps to resolve scalar dependences for PHI operands and scalars. + /// + /// Usage example: + /// + /// x1 = ... // x1 will be inserted in the ScalarMap and PhiOpMap. + /// for (i=0...N) { + /// x2 = phi(x1, add) // x2 will be inserted in the ScalarMap, x1 and + /// // add are mapped in the PHIOpMap. + /// add = x2 + A[i]; // add will be inserted in the ScalarMap and + /// // the PhiOpMap. + /// } + /// print(x1) // x1 is mapped in the ScalarMap. + /// print(x2) // x2 is mapped in the ScalarMap. + /// print(add) // add is mapped in the ScalarMap. + /// + ///{ + + /// The PHIOpMap is used to get the alloca to communicate a value to a PHI + /// node, hence when the operand of a PHI is demoted the corresponding write + /// access will use the PHIOpMap to look for the correct alloca. PHI nodes + /// will then read that location in order to get the correct/current operand + /// value. + ScalarAllocaMapTy &PHIOpMap; + + /// The ScalarMap is used in __all__ other cases, thus always when a scalar + /// variable is read/written and the write is not because the scalar is a PHI + /// operand. + ScalarAllocaMapTy &ScalarMap; + ///} + + /// @brief Map from instructions to their escape users as well as the alloca. + EscapeUsersAllocaMapTy &EscapeMap; + /// @brief Split @p BB to create a new one we can use to clone @p BB in. BasicBlock *splitBB(BasicBlock *BB); @@ -128,6 +199,64 @@ void copyBB(ScopStmt &Stmt, BasicBlock *BB, BasicBlock *BBCopy, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief Return the alloca for @p ScalarBase in @p Map. + /// + /// If no alloca was mapped to @p ScalarBase in @p Map a new one is created + /// and named after @p ScalarBase with the suffix @p NameExt. + /// + /// @param ScalarBase The demoted scalar instruction. + /// @param Map The map we should look for a mapped alloca instruction. + /// @param NameExt The suffix we add to the name of a new created alloca. + /// @param IsNew If set it will hold true iff the alloca was created. + /// + /// @returns The alloca for @p ScalarBase in @p Map. + AllocaInst *getOrCreateAlloca(Instruction *ScalarBase, ScalarAllocaMapTy &Map, + const char *NameExt = ".s2a", + bool *IsNew = nullptr); + + /// @brief Generate reload of scalars demoted to memory and needed by @p Inst. + /// + /// @param Stmt The statement we generate code for. + /// @param Inst The instruction that might need reloaded values. + /// @param BBMap A mapping from old values to their new values in this block. + virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap); + + /// @brief Generate the scalar stores for the given statement. + /// + /// After the statement @p Stmt was copied all inner-SCoP scalar dependences + /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to + /// be demoted to memory. + /// + /// @param Stmt The statement we generate code for. + /// @param BB The basic block we generate code for. + /// @param BBMap A mapping from old values to their new values in this block. + /// @param GlobalMap A mapping for globally replaced values. + virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMAp, ValueMapT &GlobalMap); + + /// @brief Handle users of @p Inst outside the SCoP. + /// + /// @param R The current SCoP region. + /// @param Inst The current instruction we check. + /// @param InstCopy The copy of the instruction @p Inst in the optimized SCoP. + void handleOutsideUsers(const Region &R, Instruction *Inst, Value *InstCopy); + + /// @brief Initialize the memory of demoted scalars. + /// + /// If a PHI node was demoted and one of its predecessor blocks was outside + /// the SCoP we need to initialize the memory cell we demoted the PHI into + /// with the value corresponding to that predecessor. As a SCoP is a + /// __single__ entry region there is at most one such predecessor. + void createScalarInitialization(Region &R, ValueMapT &VMap); + + /// @brief Promote the values of demoted scalars after the SCoP. + /// + /// If a scalar value was used outside the SCoP we need to promote the value + /// stored in the memory cell allocated for that scalar and combine it with + /// the original value in the non-optimized SCoP. + void createScalarFinalization(Region &R); + /// @brief Get the new version of a value. /// /// Given an old value, we first check if a new version of this value is @@ -183,6 +312,17 @@ ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief Copy a single PHI instruction. + /// + /// The implementation in the BlockGenerator is trivial, however it allows + /// subclasses to handle PHIs different. + /// + /// @returns The nullptr as the BlockGenerator does not copy PHIs. + virtual Value *copyPHIInstruction(ScopStmt &, const PHINode *, ValueMapT &, + ValueMapT &, LoopToScevMapT &) { + return nullptr; + } + /// @brief Copy a single Instruction. /// /// This copies a single Instruction and updates references to old values @@ -202,6 +342,22 @@ void copyInstruction(ScopStmt &Stmt, const Instruction *Inst, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S); + + /// @brief Helper to get the newest version of @p ScalarValue. + /// + /// @param ScalarValue The original value needed. + /// @param R The current SCoP region. + /// @param ReloadMap The scalar map for demoted values. + /// @param BBMap A mapping from old values to their new values + /// (for values recalculated within this basic block). + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + /// + /// @returns The newest version (e.g., reloaded) of the scalar value. + Value *getNewScalarValue(Value *ScalarValue, const Region &R, + ScalarAllocaMapTy &ReloadMap, ValueMapT &BBMap, + ValueMapT &GlobalMap); }; /// @brief Generate a new vector basic block for a polyhedral statement. @@ -374,12 +530,82 @@ /// @param LTS A map from old loops to new induction variables as SCEVs. void copyStmt(ScopStmt &Stmt, ValueMapT &GlobalMap, LoopToScevMapT <S); + /// @brief An empty destructor + virtual ~RegionGenerator(){}; + private: + /// @brief A map from old to new blocks in the region. + DenseMap BlockMap; + + /// @brief The "BBMaps" for the whole region (one for each block). + DenseMap RegionMaps; + + /// @brief Mapping to remember PHI nodes that still need incoming values. + using PHINodePairTy = std::pair; + DenseMap> IncompletePHINodeMap; + /// @brief Repair the dominance tree after we created a copy block for @p BB. /// /// @returns The immediate dominator in the DT for @p BBCopy if in the region. - BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy, - DenseMap &BlockMap); + BasicBlock *repairDominance(BasicBlock *BB, BasicBlock *BBCopy); + + /// @brief Add the new operand from the copy of @p IncomingBB to @p PHICopy. + /// + /// @param Stmt The statement to code generate. + /// @param PHI The original PHI we copy. + /// @param PHICopy The copy of @p PHI. + /// @param IncomingBB An incoming block of @p PHI. + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + /// @param LTS A map from old loops to new induction variables as + /// SCEVs. + void addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI, PHINode *PHICopy, + BasicBlock *IncomingBB, ValueMapT &GlobalMap, + LoopToScevMapT <S); + + /// @brief Generate reload of scalars demoted to memory and needed by @p Inst. + /// + /// @param Stmt The statement we generate code for. + /// @param Inst The instruction that might need reloaded values. + /// @param BBMap A mapping from old values to their new values in this block. + virtual void generateScalarLoads(ScopStmt &Stmt, const Instruction *Inst, + ValueMapT &BBMap) override; + + /// @brief Generate the scalar stores for the given statement. + /// + /// After the statement @p Stmt was copied all inner-SCoP scalar dependences + /// starting in @p Stmt (hence all scalar write accesses in @p Stmt) need to + /// be demoted to memory. + /// + /// @param Stmt The statement we generate code for. + /// @param BB The basic block we generate code for. + /// @param BBMap A mapping from old values to their new values in this block. + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + virtual void generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMAp, + ValueMapT &GlobalMap) override; + + /// @brief Copy a single PHI instruction. + /// + /// This copies a single PHI instruction and updates references to old values + /// with references to new values, as defined by GlobalMap and BBMap. + /// + /// @param Stmt The statement to code generate. + /// @param PHI The PHI instruction to copy. + /// @param BBMap A mapping from old values to their new values + /// (for values recalculated within this basic block). + /// @param GlobalMap A mapping from old values to their new values + /// (for values recalculated in the new ScoP, but not + /// within this basic block). + /// @param LTS A map from old loops to new induction variables as SCEVs. + /// + /// @returns The copied instruction or nullptr if no copy was made. + virtual Value *copyPHIInstruction(ScopStmt &Stmt, const PHINode *Inst, + ValueMapT &BBMap, ValueMapT &GlobalMap, + LoopToScevMapT <S) override; }; } #endif Index: polly/trunk/include/polly/CodeGen/IslNodeBuilder.h =================================================================== --- polly/trunk/include/polly/CodeGen/IslNodeBuilder.h +++ polly/trunk/include/polly/CodeGen/IslNodeBuilder.h @@ -32,13 +32,20 @@ DominatorTree &DT, Scop &S) : S(S), Builder(Builder), Annotator(Annotator), Rewriter(SE, DL, "polly"), ExprBuilder(Builder, IDToValue, Rewriter, DT, LI), - BlockGen(Builder, LI, SE, DT, &ExprBuilder), RegionGen(BlockGen), P(P), - DL(DL), LI(LI), SE(SE), DT(DT) {} + BlockGen(Builder, LI, SE, DT, ScalarMap, PHIOpMap, EscapeMap, + &ExprBuilder), + RegionGen(BlockGen), P(P), DL(DL), LI(LI), SE(SE), DT(DT) {} ~IslNodeBuilder() {} void addParameters(__isl_take isl_set *Context); void create(__isl_take isl_ast_node *Node); + + /// @brief Finalize code generation for the SCoP @p S. + /// + /// @see BlockGenerator::finalizeSCoP(Scop &S) + void finalizeSCoP(Scop &S) { BlockGen.finalizeSCoP(S, ValueMap); } + IslExprBuilder &getExprBuilder() { return ExprBuilder; } private: @@ -50,9 +57,26 @@ SCEVExpander Rewriter; IslExprBuilder ExprBuilder; + + /// @brief Maps used by the block and region generator to demote scalars. + /// + ///@{ + + /// @brief See BlockGenerator::ScalarMap. + BlockGenerator::ScalarAllocaMapTy ScalarMap; + + /// @brief See BlockGenerator::PhiOpMap. + BlockGenerator::ScalarAllocaMapTy PHIOpMap; + + /// @brief See BlockGenerator::EscapeMap. + BlockGenerator::EscapeUsersAllocaMapTy EscapeMap; + + ///@} + + /// @brief The generator used to copy a basic block. BlockGenerator BlockGen; - /// @brief Generator for region statements. + /// @brief The generator used to copy a non-affine region. RegionGenerator RegionGen; Pass *const P; Index: polly/trunk/include/polly/ScopInfo.h =================================================================== --- polly/trunk/include/polly/ScopInfo.h +++ polly/trunk/include/polly/ScopInfo.h @@ -25,6 +25,8 @@ #include "llvm/Analysis/RegionPass.h" #include "isl/ctx.h" +#include + using namespace llvm; namespace llvm { @@ -410,7 +412,11 @@ /// accesses. /// At the moment every statement represents a single basic block of LLVM-IR. class ScopStmt { - //===-------------------------------------------------------------------===// +public: + /// @brief List to hold all (scalar) memory accesses mapped to an instruction. + using MemoryAccessList = std::forward_list; + +private: ScopStmt(const ScopStmt &) = delete; const ScopStmt &operator=(const ScopStmt &) = delete; @@ -476,7 +482,9 @@ /// The only side effects of a statement are its memory accesses. typedef SmallVector MemoryAccessVec; MemoryAccessVec MemAccs; - std::map InstructionToAccess; + + /// @brief Mapping from instructions to (scalar) memory accesses. + DenseMap InstructionToAccess; //@} @@ -628,16 +636,31 @@ /// @brief Return true if this statement represents a whole region. bool isRegionStmt() const { return R != nullptr; } + /// @brief Return the (scalar) memory accesses for @p Inst. + const MemoryAccessList &getAccessesFor(const Instruction *Inst) const { + MemoryAccessList *MAL = lookupAccessesFor(Inst); + assert(MAL && "Cannot get memory accesses because they do not exist!"); + return *MAL; + } + + /// @brief Return the (scalar) memory accesses for @p Inst if any. + MemoryAccessList *lookupAccessesFor(const Instruction *Inst) const { + auto It = InstructionToAccess.find(Inst); + return It == InstructionToAccess.end() ? nullptr : It->getSecond(); + } + + /// @brief Return the __first__ (scalar) memory access for @p Inst. const MemoryAccess &getAccessFor(const Instruction *Inst) const { - MemoryAccess *A = lookupAccessFor(Inst); - assert(A && "Cannot get memory access because it does not exist!"); - return *A; + MemoryAccess *MA = lookupAccessFor(Inst); + assert(MA && "Cannot get memory access because it does not exist!"); + return *MA; } + /// @brief Return the __first__ (scalar) memory access for @p Inst if any. MemoryAccess *lookupAccessFor(const Instruction *Inst) const { - std::map::const_iterator at = - InstructionToAccess.find(Inst); - return at == InstructionToAccess.end() ? NULL : at->second; + auto It = InstructionToAccess.find(Inst); + return It == InstructionToAccess.end() ? nullptr + : &It->getSecond()->front(); } void setBasicBlock(BasicBlock *Block) { Index: polly/trunk/lib/Analysis/ScopInfo.cpp =================================================================== --- polly/trunk/lib/Analysis/ScopInfo.cpp +++ polly/trunk/lib/Analysis/ScopInfo.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -877,19 +878,11 @@ if (isApproximated && Access.isWrite()) Access.setMayWrite(); - MemAccs.push_back( - new MemoryAccess(Access, AccessInst, this, SAI, MemAccs.size())); - - // We do not track locations for scalar memory accesses at the moment. - // - // We do not have a use for this information at the moment. If we need this - // at some point, the "instruction -> access" mapping needs to be enhanced - // as a single instruction could then possibly perform multiple accesses. - if (!Access.isScalar()) { - assert(!InstructionToAccess.count(AccessInst) && - "Unexpected 1-to-N mapping on instruction to access map!"); - InstructionToAccess[AccessInst] = MemAccs.back(); - } + MemoryAccessList *&MAL = InstructionToAccess[AccessInst]; + if (!MAL) + MAL = new MemoryAccessList(); + MAL->emplace_front(Access, AccessInst, this, SAI, MemAccs.size()); + MemAccs.push_back(&MAL->front()); } } @@ -1258,11 +1251,7 @@ } ScopStmt::~ScopStmt() { - while (!MemAccs.empty()) { - delete MemAccs.back(); - MemAccs.pop_back(); - } - + DeleteContainerSeconds(InstructionToAccess); isl_set_free(Domain); isl_map_free(Schedule); } Index: polly/trunk/lib/CodeGen/BlockGenerators.cpp =================================================================== --- polly/trunk/lib/CodeGen/BlockGenerators.cpp +++ polly/trunk/lib/CodeGen/BlockGenerators.cpp @@ -81,8 +81,13 @@ BlockGenerator::BlockGenerator(PollyIRBuilder &B, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, + ScalarAllocaMapTy &ScalarMap, + ScalarAllocaMapTy &PHIOpMap, + EscapeUsersAllocaMapTy &EscapeMap, IslExprBuilder *ExprBuilder) - : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT) {} + : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT), + EntryBB(nullptr), PHIOpMap(PHIOpMap), ScalarMap(ScalarMap), + EscapeMap(EscapeMap) {} Value *BlockGenerator::getNewValue(ScopStmt &Stmt, const Value *Old, ValueMapT &BBMap, ValueMapT &GlobalMap, @@ -242,13 +247,22 @@ void BlockGenerator::copyInstruction(ScopStmt &Stmt, const Instruction *Inst, ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S) { + + // First check for possible scalar dependences for this instruction. + generateScalarLoads(Stmt, Inst, BBMap); + // Terminator instructions control the control flow. They are explicitly // expressed in the clast and do not need to be copied. if (Inst->isTerminator()) return; - if (canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) + Loop *L = getLoopForInst(Inst); + if ((Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) && + canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) { + Value *NewValue = getNewValue(Stmt, Inst, BBMap, GlobalMap, LTS, L); + BBMap[Inst] = NewValue; return; + } if (const LoadInst *Load = dyn_cast(Inst)) { Value *NewLoad = generateScalarLoad(Stmt, Load, BBMap, GlobalMap, LTS); @@ -266,6 +280,11 @@ return; } + if (const PHINode *PHI = dyn_cast(Inst)) { + copyPHIInstruction(Stmt, PHI, BBMap, GlobalMap, LTS); + return; + } + // Skip some special intrinsics for which we do not adjust the semantics to // the new schedule. All others are handled like every other instruction. if (auto *IT = dyn_cast(Inst)) { @@ -323,8 +342,329 @@ ValueMapT &BBMap, ValueMapT &GlobalMap, LoopToScevMapT <S) { Builder.SetInsertPoint(CopyBB->begin()); + EntryBB = &CopyBB->getParent()->getEntryBlock(); + for (Instruction &Inst : *BB) copyInstruction(Stmt, &Inst, BBMap, GlobalMap, LTS); + + // After a basic block was copied store all scalars that escape this block + // in their alloca. First the scalars that have dependences inside the SCoP, + // then the ones that might escape the SCoP. + generateScalarStores(Stmt, BB, BBMap, GlobalMap); + + const Region &R = Stmt.getParent()->getRegion(); + for (Instruction &Inst : *BB) + handleOutsideUsers(R, &Inst, BBMap[&Inst]); +} + +AllocaInst *BlockGenerator::getOrCreateAlloca(Instruction *ScalarBase, + ScalarAllocaMapTy &Map, + const char *NameExt, + bool *IsNew) { + + // Check if an alloca was cached for the base instruction. + AllocaInst *&Addr = Map[ScalarBase]; + + // If needed indicate if it was found already or will be created. + if (IsNew) + *IsNew = (Addr == nullptr); + + // If no alloca was found create one and insert it in the entry block. + if (!Addr) { + auto *Ty = ScalarBase->getType(); + Addr = new AllocaInst(Ty, ScalarBase->getName() + NameExt); + Addr->insertBefore(EntryBB->getFirstInsertionPt()); + } + + return Addr; +} + +void BlockGenerator::handleOutsideUsers(const Region &R, Instruction *Inst, + Value *InstCopy) { + BasicBlock *ExitBB = R.getExit(); + + EscapeUserVectorTy EscapeUsers; + for (User *U : Inst->users()) { + + // Non-instruction user will never escape. + Instruction *UI = dyn_cast(U); + if (!UI) + continue; + + if (R.contains(UI) && ExitBB != UI->getParent()) + continue; + + EscapeUsers.push_back(UI); + } + + // Exit if no escape uses were found. + if (EscapeUsers.empty()) + return; + + // If there are escape users we get the alloca for this instruction and put + // it in the EscapeMap for later finalization. However, if the alloca was not + // created by an already handled scalar dependence we have to initialize it + // also. Lastly, if the instruction was copied multiple times we already did + // this and can exit. + if (EscapeMap.count(Inst)) + return; + + // Get or create an escape alloca for this instruction. + bool IsNew; + AllocaInst *ScalarAddr = + getOrCreateAlloca(Inst, ScalarMap, ".escape", &IsNew); + + // Remember that this instruction has escape uses and the escape alloca. + EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers)); + + // If the escape alloca was just created store the instruction in there, + // otherwise that happened already. + if (IsNew) { + assert(InstCopy && "Except PHIs every instruction should have a copy!"); + Builder.CreateStore(InstCopy, ScalarAddr); + } +} + +void BlockGenerator::generateScalarLoads(ScopStmt &Stmt, + const Instruction *Inst, + ValueMapT &BBMap) { + + // Iterate over all memory accesses for the given instruction and handle all + // scalar reads. + if (ScopStmt::MemoryAccessList *MAL = Stmt.lookupAccessesFor(Inst)) { + for (MemoryAccess &MA : *MAL) { + if (!MA.isScalar() || !MA.isRead()) + continue; + + Instruction *ScalarBase = cast(MA.getBaseAddr()); + Instruction *ScalarInst = MA.getAccessInstruction(); + + PHINode *ScalarBasePHI = dyn_cast(ScalarBase); + + // This is either a common scalar use (second case) or the use of a phi + // operand by the PHI node (first case). + if (ScalarBasePHI == ScalarInst) { + AllocaInst *PHIOpAddr = + getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + LoadInst *LI = + Builder.CreateLoad(PHIOpAddr, PHIOpAddr->getName() + ".reload"); + BBMap[ScalarBase] = LI; + } else { + // For non-PHI operand uses we look up the alloca in the ScalarMap, + // reload it and add the mapping to the ones in the current basic block. + AllocaInst *ScalarAddr = + getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + LoadInst *LI = + Builder.CreateLoad(ScalarAddr, ScalarAddr->getName() + ".reload"); + BBMap[ScalarBase] = LI; + } + } + } +} + +Value *BlockGenerator::getNewScalarValue(Value *ScalarValue, const Region &R, + ScalarAllocaMapTy &ReloadMap, + ValueMapT &BBMap, + ValueMapT &GlobalMap) { + // If the value we want to store is an instruction we might have demoted it + // in order to make it accessible here. In such a case a reload is + // necessary. If it is no instruction it will always be a value that + // dominates the current point and we can just use it. In total there are 4 + // options: + // (1) The value is no instruction ==> use the value. + // (2) The value is an instruction that was split out of the region prior to + // code generation ==> use the instruction as it dominates the region. + // (3) The value is an instruction: + // (a) The value was defined in the current block, thus a copy is in + // the BBMap ==> use the mapped value. + // (b) The value was defined in a previous block, thus we demoted it + // earlier ==> use the reloaded value. + Instruction *ScalarValueInst = dyn_cast(ScalarValue); + if (!ScalarValueInst) + return ScalarValue; + + if (!R.contains(ScalarValueInst)) { + if (Value *ScalarValueCopy = GlobalMap.lookup(ScalarValueInst)) + return /* Case (3a) */ ScalarValueCopy; + else + return /* Case 2 */ ScalarValue; + } + + if (Value *ScalarValueCopy = BBMap.lookup(ScalarValueInst)) + return /* Case (3a) */ ScalarValueCopy; + + // Case (3b) + assert(ReloadMap.count(ScalarValueInst) && + "ScalarInst not mapped in the block and not in the given reload map!"); + Value *ReloadAddr = ReloadMap[ScalarValueInst]; + ScalarValue = + Builder.CreateLoad(ReloadAddr, ReloadAddr->getName() + ".reload"); + + return ScalarValue; +} + +void BlockGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMap, + ValueMapT &GlobalMap) { + const Region &R = Stmt.getParent()->getRegion(); + + assert(Stmt.isBlockStmt() && BB == Stmt.getBasicBlock() && + "Region statements need to use the generateScalarStores() " + "function in the RegionGenerator"); + + // Set to remember a store to the phiops alloca of a PHINode. It is needed as + // we might have multiple write accesses to the same PHI and while one is the + // self write of the PHI (to the ScalarMap alloca) the other is the write to + // the operand alloca (PHIOpMap). + SmallPtrSet SeenPHIs; + + // Iterate over all accesses in the given statement. + for (MemoryAccess *MA : Stmt) { + + // Skip non-scalar and read accesses. + if (!MA->isScalar() || MA->isRead()) + continue; + + Instruction *ScalarBase = cast(MA->getBaseAddr()); + Instruction *ScalarInst = MA->getAccessInstruction(); + PHINode *ScalarBasePHI = dyn_cast(ScalarBase); + + // Get the alloca node for the base instruction and the value we want to + // store. In total there are 4 options: + // (1) The base is no PHI, hence it is a simple scalar def-use chain. + // (2) The base is a PHI, + // (a) and the write is caused by an operand in the block. + // (b) and it is the PHI self write (same as case (1)). + // (c) (2a) and (2b) are not distinguishable. + // For case (1) and (2b) we get the alloca from the scalar map and the value + // we want to store is initialized with the instruction attached to the + // memory access. For case (2a) we get the alloca from the PHI operand map + // and the value we want to store is initialized with the incoming value for + // this block. The tricky case (2c) is when both (2a) and (2b) match. This + // happens if the PHI operand is in the same block as the PHI. To handle + // that we choose the alloca of (2a) first and (2b) for the next write + // access to that PHI (there must be 2). + Value *ScalarValue = nullptr; + AllocaInst *ScalarAddr = nullptr; + + if (!ScalarBasePHI) { + // Case (1) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else { + int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB); + if (ScalarBasePHI != ScalarInst) { + // Case (2a) + assert(PHIIdx >= 0 && "Bad scalar write to PHI operand"); + SeenPHIs.insert(ScalarBasePHI); + ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx); + } else if (PHIIdx < 0) { + // Case (2b) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else { + // Case (2c) + if (SeenPHIs.insert(ScalarBasePHI).second) { + // First access ==> same as (2a) + ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx); + } else { + // Second access ==> same as (2b) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } + } + } + + ScalarValue = + getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap); + Builder.CreateStore(ScalarValue, ScalarAddr); + } +} + +void BlockGenerator::createScalarInitialization(Region &R, + ValueMapT &GlobalMap) { + // The split block __just before__ the region and optimized region. + BasicBlock *SplitBB = R.getEnteringBlock(); + BranchInst *SplitBBTerm = cast(SplitBB->getTerminator()); + assert(SplitBBTerm->getNumSuccessors() == 2 && "Bad region entering block!"); + + // Get the start block of the __optimized__ region. + BasicBlock *StartBB = SplitBBTerm->getSuccessor(0); + if (StartBB == R.getEntry()) + StartBB = SplitBBTerm->getSuccessor(1); + + // For each PHI predecessor outside the region store the incoming operand + // value prior to entering the optimized region. + Builder.SetInsertPoint(StartBB->getTerminator()); + + ScalarAllocaMapTy EmptyMap; + for (const auto &PHIOpMapping : PHIOpMap) { + const PHINode *PHI = cast(PHIOpMapping.getFirst()); + + // Check if this PHI has the split block as predecessor (that is the only + // possible predecessor outside the SCoP). + int idx = PHI->getBasicBlockIndex(SplitBB); + if (idx < 0) + continue; + + Value *ScalarValue = PHI->getIncomingValue(idx); + ScalarValue = + getNewScalarValue(ScalarValue, R, EmptyMap, GlobalMap, GlobalMap); + + // If the split block is the predecessor initialize the PHI operator alloca. + Builder.CreateStore(ScalarValue, PHIOpMapping.getSecond()); + } +} + +void BlockGenerator::createScalarFinalization(Region &R) { + // The exit block of the __unoptimized__ region. + BasicBlock *ExitBB = R.getExitingBlock(); + // The merge block __just after__ the region and the optimized region. + BasicBlock *MergeBB = R.getExit(); + + // The exit block of the __optimized__ region. + BasicBlock *OptExitBB = *(pred_begin(MergeBB)); + if (OptExitBB == ExitBB) + OptExitBB = *(++pred_begin(MergeBB)); + + Builder.SetInsertPoint(OptExitBB->getTerminator()); + for (const auto &EscapeMapping : EscapeMap) { + // Extract the escaping instruction and the escaping users as well as the + // alloca the instruction was demoted to. + Instruction *EscapeInst = EscapeMapping.getFirst(); + const auto &EscapeMappingValue = EscapeMapping.getSecond(); + const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second; + AllocaInst *ScalarAddr = EscapeMappingValue.first; + + // Reload the demoted instruction in the optimized version of the SCoP. + Instruction *EscapeInstReload = + Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload"); + + // Create the merge PHI that merges the optimized and unoptimized version. + PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2, + EscapeInst->getName() + ".merge"); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); + + // Add the respective values to the merge PHI. + MergePHI->addIncoming(EscapeInstReload, OptExitBB); + MergePHI->addIncoming(EscapeInst, ExitBB); + + // The information of scalar evolution about the escaping instruction needs + // to be revoked so the new merged instruction will be used. + if (SE.isSCEVable(EscapeInst->getType())) + SE.forgetValue(EscapeInst); + + // Replace all uses of the demoted instruction with the merge PHI. + for (Instruction *EUser : EscapeUsers) + EUser->replaceUsesOfWith(EscapeInst, MergePHI); + } +} + +void BlockGenerator::finalizeSCoP(Scop &S, ValueMapT &GlobalMap) { + createScalarInitialization(S.getRegion(), GlobalMap); + createScalarFinalization(S.getRegion()); } VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen, @@ -679,9 +1019,8 @@ copyInstruction(Stmt, &Inst, VectorBlockMap, ScalarBlockMap); } -BasicBlock *RegionGenerator::repairDominance( - BasicBlock *BB, BasicBlock *BBCopy, - DenseMap &BlockMap) { +BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB, + BasicBlock *BBCopy) { BasicBlock *BBIDom = DT.getNode(BB)->getIDom()->getBlock(); BasicBlock *BBCopyIDom = BlockMap.lookup(BBIDom); @@ -697,20 +1036,31 @@ assert(Stmt.isRegionStmt() && "Only region statements can be copied by the block generator"); + // Forget all old mappings. + BlockMap.clear(); + RegionMaps.clear(); + IncompletePHINodeMap.clear(); + // The region represented by the statement. Region *R = Stmt.getRegion(); - // The "BBMaps" for the whole region. - DenseMap RegionMaps; + // Create a dedicated entry for the region where we can reload all demoted + // inputs. + BasicBlock *EntryBB = R->getEntry(); + BasicBlock *EntryBBCopy = + SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI); + EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry"); + Builder.SetInsertPoint(EntryBBCopy->begin()); - // A map from old to new blocks in the region - DenseMap BlockMap; + for (auto PI = pred_begin(EntryBB), PE = pred_end(EntryBB); PI != PE; ++PI) + if (!R->contains(*PI)) + BlockMap[*PI] = EntryBBCopy; // Iterate over all blocks in the region in a breadth-first search. std::deque Blocks; SmallPtrSet SeenBlocks; - Blocks.push_back(R->getEntry()); - SeenBlocks.insert(R->getEntry()); + Blocks.push_back(EntryBB); + SeenBlocks.insert(EntryBB); while (!Blocks.empty()) { BasicBlock *BB = Blocks.front(); @@ -718,7 +1068,10 @@ // First split the block and update dominance information. BasicBlock *BBCopy = splitBB(BB); - BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy, BlockMap); + BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy); + + // In order to remap PHI nodes we store also basic block mappings. + BlockMap[BB] = BBCopy; // Get the mapping for this block and initialize it with the mapping // available at its immediate dominator (in the new region). @@ -728,22 +1081,28 @@ // Copy the block with the BlockGenerator. copyBB(Stmt, BB, BBCopy, RegionMap, GlobalMap, LTS); + // In order to remap PHI nodes we store also basic block mappings. + BlockMap[BB] = BBCopy; + + // Add values to incomplete PHI nodes waiting for this block to be copied. + for (const PHINodePairTy &PHINodePair : IncompletePHINodeMap[BB]) + addOperandToPHI(Stmt, PHINodePair.first, PHINodePair.second, BB, + GlobalMap, LTS); + IncompletePHINodeMap[BB].clear(); + // And continue with new successors inside the region. for (auto SI = succ_begin(BB), SE = succ_end(BB); SI != SE; SI++) if (R->contains(*SI) && SeenBlocks.insert(*SI).second) Blocks.push_back(*SI); - - // In order to remap PHI nodes we store also basic block mappings. - BlockMap[BB] = BBCopy; } // Now create a new dedicated region exit block and add it to the region map. BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), &DT, &LI); - ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".as.exit"); + ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit"); BlockMap[R->getExit()] = ExitBBCopy; - repairDominance(R->getExit(), ExitBBCopy, BlockMap); + repairDominance(R->getExit(), ExitBBCopy); // As the block generator doesn't handle control flow we need to add the // region control flow by hand after all blocks have been copied. @@ -762,6 +1121,178 @@ BICopy->eraseFromParent(); } + // Add counting PHI nodes to all loops in the region that can be used as + // replacement for SCEVs refering to the old loop. + for (BasicBlock *BB : SeenBlocks) { + Loop *L = LI.getLoopFor(BB); + if (L == nullptr || L->getHeader() != BB) + continue; + + BasicBlock *BBCopy = BlockMap[BB]; + Value *NullVal = Builder.getInt32(0); + PHINode *LoopPHI = + PHINode::Create(Builder.getInt32Ty(), 2, "polly.subregion.iv"); + Instruction *LoopPHIInc = BinaryOperator::CreateAdd( + LoopPHI, Builder.getInt32(1), "polly.subregion.iv.inc"); + LoopPHI->insertBefore(BBCopy->begin()); + LoopPHIInc->insertBefore(BBCopy->getTerminator()); + + for (auto *PredBB : make_range(pred_begin(BB), pred_end(BB))) { + if (!R->contains(PredBB)) + continue; + if (L->contains(PredBB)) + LoopPHI->addIncoming(LoopPHIInc, BlockMap[PredBB]); + else + LoopPHI->addIncoming(NullVal, BlockMap[PredBB]); + } + + for (auto *PredBBCopy : make_range(pred_begin(BBCopy), pred_end(BBCopy))) + if (LoopPHI->getBasicBlockIndex(PredBBCopy) < 0) + LoopPHI->addIncoming(NullVal, PredBBCopy); + + LTS[L] = SE.getUnknown(LoopPHI); + } + + // Add all mappings from the region to the global map so outside uses will use + // the copied instructions. + for (auto &BBMap : RegionMaps) + GlobalMap.insert(BBMap.second.begin(), BBMap.second.end()); + // Reset the old insert point for the build. Builder.SetInsertPoint(ExitBBCopy->begin()); } + +void RegionGenerator::generateScalarLoads(ScopStmt &Stmt, + const Instruction *Inst, + ValueMapT &BBMap) { + + // Inside a non-affine region PHI nodes are copied not demoted. Once the + // phi is copied it will reload all inputs from outside the region, hence + // we do not need to generate code for the read access of the operands of a + // PHI. + if (isa(Inst)) + return; + + return BlockGenerator::generateScalarLoads(Stmt, Inst, BBMap); +} + +void RegionGenerator::generateScalarStores(ScopStmt &Stmt, BasicBlock *BB, + ValueMapT &BBMap, + ValueMapT &GlobalMap) { + const Region &R = Stmt.getParent()->getRegion(); + + Region *StmtR = Stmt.getRegion(); + assert(StmtR && "Block statements need to use the generateScalarStores() " + "function in the BlockGenerator"); + + BasicBlock *ExitBB = StmtR->getExit(); + + // For region statements three kinds of scalar stores exists: + // (1) A definition used by a non-phi instruction outside the region. + // (2) A phi-instruction in the region entry. + // (3) A write to a phi instruction in the region exit. + // The last case is the tricky one since we do not know anymore which + // predecessor of the exit needs to store the operand value that doesn't + // have a definition in the region. Therefore, we have to check in each + // block in the region if we should store the value or not. + + // Iterate over all accesses in the given statement. + for (MemoryAccess *MA : Stmt) { + + // Skip non-scalar and read accesses. + if (!MA->isScalar() || MA->isRead()) + continue; + + Instruction *ScalarBase = cast(MA->getBaseAddr()); + Instruction *ScalarInst = MA->getAccessInstruction(); + PHINode *ScalarBasePHI = dyn_cast(ScalarBase); + + Value *ScalarValue = nullptr; + AllocaInst *ScalarAddr = nullptr; + + if (!ScalarBasePHI) { + // Case (1) + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else if (ScalarBasePHI->getParent() != ExitBB) { + // Case (2) + assert(ScalarBasePHI->getParent() == StmtR->getEntry() && + "Bad PHI self write in non-affine region"); + assert(ScalarBase == ScalarInst && + "Bad PHI self write in non-affine region"); + ScalarAddr = getOrCreateAlloca(ScalarBase, ScalarMap, ".s2a"); + ScalarValue = ScalarInst; + } else { + int PHIIdx = ScalarBasePHI->getBasicBlockIndex(BB); + // Skip accesses we will not handle in this basic block but in another one + // in the statement region. + if (PHIIdx < 0) + continue; + + // Case (3) + ScalarAddr = getOrCreateAlloca(ScalarBase, PHIOpMap, ".phiops"); + ScalarValue = ScalarBasePHI->getIncomingValue(PHIIdx); + } + + ScalarValue = + getNewScalarValue(ScalarValue, R, ScalarMap, BBMap, GlobalMap); + Builder.CreateStore(ScalarValue, ScalarAddr); + } +} + +void RegionGenerator::addOperandToPHI(ScopStmt &Stmt, const PHINode *PHI, + PHINode *PHICopy, BasicBlock *IncomingBB, + ValueMapT &GlobalMap, + LoopToScevMapT <S) { + Region *StmtR = Stmt.getRegion(); + + // If the incoming block was not yet copied mark this PHI as incomplete. + // Once the block will be copied the incoming value will be added. + BasicBlock *BBCopy = BlockMap[IncomingBB]; + if (!BBCopy) { + assert(StmtR->contains(IncomingBB) && + "Bad incoming block for PHI in non-affine region"); + IncompletePHINodeMap[IncomingBB].push_back(std::make_pair(PHI, PHICopy)); + return; + } + + Value *OpCopy = nullptr; + if (StmtR->contains(IncomingBB)) { + assert(RegionMaps.count(BBCopy) && + "Incoming PHI block did not have a BBMap"); + ValueMapT &BBCopyMap = RegionMaps[BBCopy]; + + Value *Op = PHI->getIncomingValueForBlock(IncomingBB); + OpCopy = + getNewValue(Stmt, Op, BBCopyMap, GlobalMap, LTS, getLoopForInst(PHI)); + } else { + + if (PHICopy->getBasicBlockIndex(BBCopy) >= 0) + return; + + AllocaInst *PHIOpAddr = + getOrCreateAlloca(const_cast(PHI), PHIOpMap, ".phiops"); + OpCopy = new LoadInst(PHIOpAddr, PHIOpAddr->getName() + ".reload", + BlockMap[IncomingBB]->getTerminator()); + } + + assert(OpCopy && "Incoming PHI value was not copied properly"); + assert(BBCopy && "Incoming PHI block was not copied properly"); + PHICopy->addIncoming(OpCopy, BBCopy); +} + +Value *RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, const PHINode *PHI, + ValueMapT &BBMap, + ValueMapT &GlobalMap, + LoopToScevMapT <S) { + unsigned NumIncoming = PHI->getNumIncomingValues(); + PHINode *PHICopy = + Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName()); + PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI()); + BBMap[PHI] = PHICopy; + + for (unsigned u = 0; u < NumIncoming; u++) + addOperandToPHI(Stmt, PHI, PHICopy, PHI->getIncomingBlock(u), GlobalMap, + LTS); + return PHICopy; +} Index: polly/trunk/lib/CodeGen/CodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/CodeGeneration.cpp +++ polly/trunk/lib/CodeGen/CodeGeneration.cpp @@ -131,6 +131,8 @@ NodeBuilder.create(AstRoot); + NodeBuilder.finalizeSCoP(S); + assert(!verifyGeneratedFunction(S, *EnteringBB->getParent()) && "Verification of generated function failed"); return true; Index: polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll +++ polly/trunk/test/Isl/CodeGen/phi_condition_modeling_1.ll @@ -0,0 +1,59 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen < %s | FileCheck %s +; +; void f(int *A, int c, int N) { +; int tmp; +; for (int i = 0; i < N; i++) { +; if (i > c) +; tmp = 3; +; else +; tmp = 5; +; A[i] = tmp; +; } +; } +; +; CHECK-LABEL: bb: +; CHECK: %tmp.0.phiops = alloca i32 +; CHECK-LABEL: polly.stmt.bb8: +; CHECK: %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops +; CHECK: store i32 %tmp.0.phiops.reload, i32* +; CHECK-LABEL: polly.stmt.bb6: +; CHECK: store i32 3, i32* %tmp.0.phiops +; CHECK-LABEL: polly.stmt.bb7: +; CHECK: store i32 5, i32* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %c, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + %tmp1 = sext i32 %c to i64 + br label %bb2 + +bb2: ; preds = %bb10, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ] + %tmp3 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp3, label %bb4, label %bb11 + +bb4: ; preds = %bb2 + %tmp5 = icmp sgt i64 %indvars.iv, %tmp1 + br i1 %tmp5, label %bb6, label %bb7 + +bb6: ; preds = %bb4 + br label %bb8 + +bb7: ; preds = %bb4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6 + %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ] + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp.0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb2 + +bb11: ; preds = %bb2 + ret void +} Index: polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll +++ polly/trunk/test/Isl/CodeGen/phi_condition_modeling_2.ll @@ -0,0 +1,66 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen < %s | FileCheck %s +; +; void f(int *A, int c, int N) { +; int tmp; +; for (int i = 0; i < N; i++) { +; if (i > c) +; tmp = 3; +; else +; tmp = 5; +; A[i] = tmp; +; } +; } +; +; CHECK-LABEL: bb: +; CHECK-DAG: %tmp.0.s2a = alloca i32 +; CHECK-DAG: %tmp.0.phiops = alloca i32 +; CHECK-LABEL: polly.stmt.bb8: +; CHECK: %tmp.0.phiops.reload = load i32, i32* %tmp.0.phiops +; CHECK: store i32 %tmp.0.phiops.reload, i32* %tmp.0.s2a +; CHECK-LABEL: polly.stmt.bb8b: +; CHECK: %tmp.0.s2a.reload = load i32, i32* %tmp.0.s2a +; CHECK: store i32 %tmp.0.s2a.reload, +; CHECK-LABEL: polly.stmt.bb6: +; CHECK: store i32 3, i32* %tmp.0.phiops +; CHECK-LABEL: polly.stmt.bb7: +; CHECK: store i32 5, i32* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %c, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + %tmp1 = sext i32 %c to i64 + br label %bb2 + +bb2: ; preds = %bb10, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb10 ], [ 0, %bb ] + %tmp3 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp3, label %bb4, label %bb11 + +bb4: ; preds = %bb2 + %tmp5 = icmp sgt i64 %indvars.iv, %tmp1 + br i1 %tmp5, label %bb6, label %bb7 + +bb6: ; preds = %bb4 + br label %bb8 + +bb7: ; preds = %bb4 + br label %bb8 + +bb8: ; preds = %bb7, %bb6 + %tmp.0 = phi i32 [ 3, %bb6 ], [ 5, %bb7 ] + br label %bb8b + +bb8b: + %tmp9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %tmp.0, i32* %tmp9, align 4 + br label %bb10 + +bb10: ; preds = %bb8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb2 + +bb11: ; preds = %bb2 + ret void +} Index: polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll +++ polly/trunk/test/Isl/CodeGen/phi_conditional_simple_1.ll @@ -0,0 +1,72 @@ +; RUN: opt %loadPolly -analyze -polly-ast -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -polly-codegen < %s | FileCheck %s +; +; void jd(int *A, int c) { +; for (int i = 0; i < 1024; i++) { +; if (c) +; A[i] = 1; +; else +; A[i] = 2; +; } +; } + +; AST: for (int c0 = 0; c0 <= 1023; c0 += 1) { +; AST: if (c <= -1) { +; AST: Stmt_if_then(c0); +; AST: } else if (c >= 1) { +; AST: Stmt_if_then(c0); +; AST: } else +; AST: Stmt_if_else(c0); +; AST: Stmt_if_end(c0); +; AST: } +; +; CHECK-LABEL: entry: +; CHECK-NEXT: %phi.phiops = alloca i32 +; CHECK-LABEL: polly.stmt.if.end: +; CHECK-NEXT: %phi.phiops.reload = load i32, i32* %phi.phiops +; CHECK-NEXT: %scevgep +; CHECK-NEXT: store i32 %phi.phiops.reload, i32* +; CHECK-LABEL: polly.stmt.if.then: +; CHECK-NEXT: store i32 1, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; CHECK-LABEL: polly.stmt.if.then{{.}}: +; CHECK-NEXT: store i32 1, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; CHECK-LABEL: polly.stmt.if.else: +; CHECK-NEXT: store i32 2, i32* %phi.phiops +; CHECK-NEXT: br label %polly.merge{{[.]?}} +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @jd(i32* %A, i32 %c) { +entry: + br label %for.cond + +for.cond: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 1024 + br i1 %exitcond, label %for.body, label %for.end + +for.body: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + br label %if.end + +if.else: + br label %if.end + +if.end: + %phi = phi i32 [ 1, %if.then], [ 2, %if.else ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %phi, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: + ret void +} Index: polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll +++ polly/trunk/test/Isl/CodeGen/phi_loop_carried_float.ll @@ -0,0 +1,67 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen < %s | FileCheck %s +; +; float f(float *A, int N) { +; float tmp = 0; +; for (int i = 0; i < N; i++) +; tmp += A[i]; +; } +; +; CHECK: bb: +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; CHECK-DAG: %tmp.0.s2a = alloca float +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; CHECK-DAG: %tmp.0.phiops = alloca float +; CHECK-NOT: %tmp7{{[.*]}} = alloca float +; +; CHECK: polly.merge_new_and_old: +; CHECK-NEXT: ret +; +; CHECK: polly.start: +; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops + +; CHECK: polly.merge: +; CHECK-NEXT: br label %polly.merge_new_and_old + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R1:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK: store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK: store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb4: ; preds = %polly.then3 +; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float, float* %scevgep, align 4, !alias.scope !0, !noalias !2 +; CHECK: %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a +; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_ +; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ] + %tmp2 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp2, label %bb3, label %bb8 + +bb3: ; preds = %bb1 + br label %bb4 + +bb4: ; preds = %bb3 + %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %tmp6 = load float, float* %tmp5, align 4 + %tmp7 = fadd float %tmp.0, %tmp6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb8: ; preds = %bb1 + br label %exit + +exit: + ret void +} Index: polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll +++ polly/trunk/test/Isl/CodeGen/phi_loop_carried_float_escape.ll @@ -0,0 +1,63 @@ +; RUN: opt %loadPolly -S -polly-no-early-exit -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-codegen < %s | FileCheck %s +; +; float f(float *A, int N) { +; float tmp = 0; +; for (int i = 0; i < N; i++) +; tmp += A[i]; +; return tmp; +; } +; +; CHECK: polly.merge_new_and_old: +; CHECK-NEXT: %tmp.0.merge = phi float [ %tmp.0.final_reload, %polly.merge ], [ %tmp.0, %bb8 ] +; CHECK-NEXT: ret float %tmp.0.merge +; +; CHECK: polly.start: +; CHECK-NEXT: store float 0.000000e+00, float* %tmp.0.phiops + +; CHECK: polly.merge: +; CHECK-NEXT: %tmp.0.final_reload = load float, float* %tmp.0.s2a +; CHECK-NEXT: br label %polly.merge_new_and_old + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R1:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK-: store float %tmp.0.phiops.reload[[R1]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb1{{[0-9]*}}: +; CHECK-NEXT: %tmp.0.phiops.reload[[R2:[0-9]*]] = load float, float* %tmp.0.phiops +; CHECK: store float %tmp.0.phiops.reload[[R2]], float* %tmp.0.s2a + +; CHECK: polly.stmt.bb4: ; preds = %polly.then3 +; CHECK: %tmp[[R5:[0-9]*]]_p_scalar_ = load float, float* %scevgep, align 4, !alias.scope !0, !noalias !2 +; CHECK: %tmp.0.s2a.reload[[R3:[0-9]*]] = load float, float* %tmp.0.s2a +; CHECK: %p_tmp[[R4:[0-9]*]] = fadd float %tmp.0.s2a.reload[[R3]], %tmp[[R5]]_p_scalar_ +; CHECK: store float %p_tmp[[R4]], float* %tmp.0.phiops + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define float @f(float* %A, i32 %N) { +bb: + %tmp = sext i32 %N to i64 + br label %bb1 + +bb1: ; preds = %bb4, %bb + %indvars.iv = phi i64 [ %indvars.iv.next, %bb4 ], [ 0, %bb ] + %tmp.0 = phi float [ 0.000000e+00, %bb ], [ %tmp7, %bb4 ] + %tmp2 = icmp slt i64 %indvars.iv, %tmp + br i1 %tmp2, label %bb3, label %bb8 + +bb3: ; preds = %bb1 + br label %bb4 + +bb4: ; preds = %bb3 + %tmp5 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %tmp6 = load float, float* %tmp5, align 4 + %tmp7 = fadd float %tmp.0, %tmp6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %bb1 + +bb8: ; preds = %bb1 + br label %exit + +exit: + ret float %tmp.0 +} Index: polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll +++ polly/trunk/test/Isl/CodeGen/phi_scalar_simple_1.ll @@ -0,0 +1,93 @@ +; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen < %s | FileCheck %s +; +; int jd(int *restrict A, int x, int N) { +; for (int i = 1; i < N; i++) +; for (int j = 3; j < N; j++) +; x += A[i]; +; return x; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @jd(i32* noalias %A, i32 %x, i32 %N) { +entry: +; CHECK-LABEL: entry: +; CHECK-DAG: %x.addr.1.lcssa.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.lcssa.phiops = alloca i32 +; CHECK-DAG: %x.addr.1.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.phiops = alloca i32 +; CHECK-DAG: %x.addr.0.s2a = alloca i32 +; CHECK-DAG: %x.addr.0.phiops = alloca i32 + %tmp = sext i32 %N to i64 + br label %for.cond + +; CHECK-LABEL: polly.merge_new_and_old: +; CHECK: %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ] +; CHECK: ret i32 %x.addr.0.merge + +; CHECK-LABEL: polly.start: +; CHECK-NEXT: store i32 %x, i32* %x.addr.0.phiops + +; CHECK-LABEL: polly.merge: +; CHECK: %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a + +for.cond: ; preds = %for.inc4, %entry +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc4 ], [ 1, %entry ] + %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1.lcssa, %for.inc4 ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end6 + +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + +for.body: ; preds = %for.cond +; CHECK-LABEL: polly.stmt.for.body: +; CHECK: %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a +; CHECK: store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops + br label %for.cond1 + +for.end: ; preds = %for.cond1 +; CHECK-LABEL: polly.stmt.for.end: +; CHECK-NEXT: %x.addr.1.lcssa.phiops.reload = load i32, i32* %x.addr.1.lcssa.phiops +; CHECK-NEXT: store i32 %x.addr.1.lcssa.phiops.reload, i32* %x.addr.1.lcssa.s2a[[R4:[0-9]*]] + %x.addr.1.lcssa = phi i32 [ %x.addr.1, %for.cond1 ] + br label %for.inc4 + +for.inc4: ; preds = %for.end +; CHECK-LABEL: polly.stmt.for.inc4: +; CHECK: %x.addr.1.lcssa.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.lcssa.s2a[[R4]] +; CHECK: store i32 %x.addr.1.lcssa.s2a.reload[[R5]], i32* %x.addr.0.phiops + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.cond1: ; preds = %for.inc, %for.body +; CHECK-LABEL: polly.stmt.for.cond1: +; CHECK: %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a[[R6:[0-9]*]] +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.lcssa.phiops + %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %add, %for.inc ] + %j.0 = phi i32 [ 3, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + br label %for.inc + +for.inc: ; preds = %for.body3 +; CHECK-LABEL: polly.stmt.for.inc: +; CHECK: %x.addr.1.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: %p_add = add nsw i32 %x.addr.1.s2a.reload[[R3]], %tmp1_p_scalar_ +; CHECK: store i32 %p_add, i32* %x.addr.1.phiops + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %x.addr.1, %tmp1 + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end6: ; preds = %for.cond + ret i32 %x.addr.0 +} Index: polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll =================================================================== --- polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll +++ polly/trunk/test/Isl/CodeGen/phi_scalar_simple_2.ll @@ -0,0 +1,108 @@ +; RUN: opt %loadPolly -S -polly-detect-unprofitable -polly-model-phi-nodes -disable-polly-intra-scop-scalar-to-array -polly-no-early-exit -polly-codegen < %s | FileCheck %s +; +; int jd(int *restrict A, int x, int N, int c) { +; for (int i = 0; i < N; i++) +; for (int j = 0; j < N; j++) +; if (i < c) +; x += A[i]; +; return x; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @jd(i32* noalias %A, i32 %x, i32 %N, i32 %c) { +entry: +; CHECK-LABEL: entry: +; CHECK-DAG: %x.addr.2.s2a = alloca i32 +; CHECK-DAG: %x.addr.2.phiops = alloca i32 +; CHECK-DAG: %x.addr.1.s2a = alloca i32 +; CHECK-DAG: %x.addr.1.phiops = alloca i32 +; CHECK-DAG: %x.addr.0.s2a = alloca i32 +; CHECK-DAG: %x.addr.0.phiops = alloca i32 + %tmp = sext i32 %N to i64 + %tmp1 = sext i32 %c to i64 + br label %for.cond + +; CHECK-LABEL: polly.merge_new_and_old: +; CHECK: %x.addr.0.merge = phi i32 [ %x.addr.0.final_reload, %polly.merge ], [ %x.addr.0, %for.cond ] +; CHECK: ret i32 %x.addr.0.merge + +; CHECK-LABEL: polly.start: +; CHECK-NEXT: store i32 %x, i32* %x.addr.0.phiops + +; CHECK-LABEL: polly.merge: +; CHECK: %x.addr.0.final_reload = load i32, i32* %x.addr.0.s2a + +for.cond: ; preds = %for.inc5, %entry +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc5 ], [ 0, %entry ] + %x.addr.0 = phi i32 [ %x, %entry ], [ %x.addr.1, %for.inc5 ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end7 + +; CHECK-LABEL: polly.stmt.for.cond{{[0-9]*}}: +; CHECK: %x.addr.0.phiops.reload[[R1:[0-9]*]] = load i32, i32* %x.addr.0.phiops +; CHECK: store i32 %x.addr.0.phiops.reload[[R1]], i32* %x.addr.0.s2a + +for.body: ; preds = %for.cond +; CHECK-LABEL: polly.stmt.for.body: +; CHECK: %x.addr.0.s2a.reload[[R2:[0-9]*]] = load i32, i32* %x.addr.0.s2a +; CHECK: store i32 %x.addr.0.s2a.reload[[R2]], i32* %x.addr.1.phiops + br label %for.cond1 + +for.inc5: ; preds = %for.end +; CHECK-LABEL: polly.stmt.for.inc5: +; CHECK: %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: store i32 %x.addr.1.s2a.reload[[R5]], i32* %x.addr.0.phiops + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.cond1: ; preds = %for.inc, %for.body +; CHECK-LABEL: polly.stmt.for.cond1: +; CHECK: %x.addr.1.phiops.reload = load i32, i32* %x.addr.1.phiops +; CHECK: store i32 %x.addr.1.phiops.reload, i32* %x.addr.1.s2a + %x.addr.1 = phi i32 [ %x.addr.0, %for.body ], [ %x.addr.2, %for.inc ] + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 +; CHECK-LABEL: polly.stmt.for.body3: +; CHECK: %x.addr.1.s2a.reload = load i32, i32* %x.addr.1.s2a +; CHECK: store i32 %x.addr.1.s2a.reload, i32* %x.addr.2.phiops + %cmp4 = icmp slt i64 %indvars.iv, %tmp1 + br i1 %cmp4, label %if.then, label %if.end + +if.end: ; preds = %if.then, %for.body3 +; CHECK-LABEL: polly.stmt.if.end: +; CHECK: %x.addr.2.phiops.reload = load i32, i32* %x.addr.2.phiops +; CHECK: store i32 %x.addr.2.phiops.reload, i32* %x.addr.2.s2a + %x.addr.2 = phi i32 [ %add, %if.then ], [ %x.addr.1, %for.body3 ] + br label %for.inc + +for.inc: ; preds = %if.end +; CHECK-LABEL: polly.stmt.for.inc: +; CHECK: %x.addr.2.s2a.reload[[R3:[0-9]*]] = load i32, i32* %x.addr.2.s2a +; CHECK: store i32 %x.addr.2.s2a.reload[[R3]], i32* %x.addr.1.phiops + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +if.then: ; preds = %for.body3 +; CHECK-LABEL: polly.stmt.if.then: +; CHECK: %x.addr.1.s2a.reload[[R5:[0-9]*]] = load i32, i32* %x.addr.1.s2a +; CHECK: %p_add = add nsw i32 %x.addr.1.s2a.reload[[R5]], %tmp2_p_scalar_ +; CHECK: store i32 %p_add, i32* %x.addr.2.phiops + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp2 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %x.addr.1, %tmp2 + br label %if.end + +for.end: ; preds = %for.cond1 + br label %for.inc5 + +for.end7: ; preds = %for.cond + ret i32 %x.addr.0 +} +