Index: include/polly/CodeGen/BlockGenerators.h =================================================================== --- include/polly/CodeGen/BlockGenerators.h +++ include/polly/CodeGen/BlockGenerators.h @@ -419,6 +419,17 @@ /// the original value in the non-optimized SCoP. void createScalarFinalization(Region &R); + /// @brief Recompute all scalars needed in this statement. + /// + /// During SCoP creation scalars can be virtually moved to simplify the SCoP + /// description as well as the dependences. However, they are only moved if + /// we can recompute them in the statements they are used in. This method will + /// perform the recomputation before we clone the original statement into the + /// new, optimized region, thus ensuring all scalars are available. + void recomputeDependentScalars(ScopStmt &Stmt, ValueMapT &BBMap, + LoopToScevMapT &LTS, + isl_id_to_ast_expr *NewAccesses); + /// @brief Try to synthesize a new value /// /// Given an old value, we try to synthesize it in a new context from its @@ -441,7 +452,7 @@ /// @returns o A newly synthesized value. /// o NULL, if synthesizing the value failed. Value *trySynthesizeNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap, - LoopToScevMapT &LTS, Loop *L) const; + LoopToScevMapT &LTS, Loop *L); /// @brief Get the new version of a value. /// @@ -463,21 +474,21 @@ /// @param L The loop that surrounded the instruction that referenced /// this value in the original code. This loop is used to /// evaluate the scalar evolution at the right scope. + /// @param TryOnly Flag to indicate that nullptr is a valid return value + /// if no new value was found. /// /// @returns o The old value, if it is still valid. /// o The new value, if available. - /// o NULL, if no value is found. + /// o NULL, if no value is found and TryOnly is set. + /// o Otherwise, a trap is triggered. Value *getNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap, - LoopToScevMapT &LTS, Loop *L) const; + LoopToScevMapT &LTS, Loop *L, bool TryOnly = false); void copyInstScalar(ScopStmt &Stmt, Instruction *Inst, ValueMapT &BBMap, - LoopToScevMapT &LTS); + LoopToScevMapT &LTS, bool Recompute = false); - /// @brief Get the innermost loop that surrounds an instruction. - /// - /// @param Inst The instruction for which we get the loop. - /// @return The innermost loop that surrounds the instruction. - Loop *getLoopForInst(const Instruction *Inst); + /// @brief Get the innermost loop that surrounds the statement @p Stmt. + Loop *getLoopForStmt(const ScopStmt &Stmt) const; /// @brief Generate the operand address /// @param NewAccesses A map from memory access ids to new ast expressions, @@ -532,8 +543,13 @@ /// @param NewAccesses A map from memory access ids to new ast expressions, /// which may contain new access expressions for certain /// memory accesses. + /// @param Recompute Flag to indicate the instruction is a scalar that + /// needs to be recomputed in this statement. It basically + /// forces us to copy not only the instruction but also all + /// operands if we cannot find a local or global mapping. void copyInstruction(ScopStmt &Stmt, Instruction *Inst, ValueMapT &BBMap, - LoopToScevMapT &LTS, isl_id_to_ast_expr *NewAccesses); + LoopToScevMapT &LTS, isl_id_to_ast_expr *NewAccesses, + bool Recompute = false); /// @brief Helper to get the newest version of @p ScalarValue.
/// Index: include/polly/ScopInfo.h =================================================================== --- include/polly/ScopInfo.h +++ include/polly/ScopInfo.h @@ -509,6 +509,12 @@ /// @param SAI Info object for the accessed array. void buildAccessRelation(const ScopArrayInfo *SAI); + /// @brief Copy this memory access into the given statement @p Stmt. + /// + /// @param AccList The list that contains all accesses for @p Stmt. + /// @param Stmt The statement the copied access should reside in. + MemoryAccess *copy(AccFuncSetType &AccList, ScopStmt *Stmt) const; + public: /// @brief Create a new MemoryAccess. /// @@ -532,7 +538,7 @@ ~MemoryAccess(); /// @brief Get the type of a memory access. - enum AccessType getType() { return AccType; } + enum AccessType getType() const { return AccType; } /// @brief Is this a reduction like access? bool isReductionLike() const { return RedType != RT_NONE; } @@ -775,6 +781,9 @@ std::string BaseName; + /// @brief Set of scalar values that need to be recomputed in this statement. + SetVector<Instruction *> DependentScalars; + /// Build the statement. //@{ void buildDomain(); @@ -917,7 +926,18 @@ } /// @brief Add @p Access to this statement's list of accesses. - void addAccess(MemoryAccess *Access); + /// + /// @param Access The access to add. + /// @param Front Flag to indicate where the access should be added. + void addAccess(MemoryAccess *Access, bool Front = false); + + /// @brief Remove the memory access @p MA from this statement. + /// + /// @param MA The access to remove. + /// @param OnlyMA Flag to indicate if only @p MA should be removed or all + /// accesses caused by the access instruction of @p MA. + /// + void removeMemoryAccess(MemoryAccess *MA, bool OnlyMA); /// @brief Remove the memory access in @p InvMAs. /// @@ -947,6 +967,14 @@ /// @brief Get the isl AST build. __isl_keep isl_ast_build *getAstBuild() const { return Build; } + /// @brief Add a scalar that needs to be recomputed in this statement. + void addDependentScalar(Instruction *Inst) { DependentScalars.insert(Inst); } + + /// @brief Return the scalars that need to be recomputed in this statement. + const SetVector<Instruction *> &getDependentScalars() const { + return DependentScalars; + } + /// @brief Restrict the domain of the statement. /// /// @param NewDomain The new statement domain. @@ -1238,6 +1266,69 @@ /// @brief Add invariant loads listed in @p InvMAs with the domain of @p Stmt. void addInvariantLoads(ScopStmt &Stmt, MemoryAccessList &InvMAs); + /// @brief Check if all instructions in @p Insts can be recomputed in @p Stmt. + /// + /// @param Stmt The statement we want to recompute @p Insts in. + /// @param Insts The instructions we need to recompute. + /// + /// @returns True, if all instructions can be recomputed in @p Stmt. + bool canRecomputeInStmt(ScopStmt &Stmt, SmallPtrSet<Instruction *, 8> &Insts); + + /// @brief Type for outside operands and their in-SCoP users. + using OutsideOperandsSetTy = + SmallVector<std::pair<Instruction *, Instruction *>, 4>; + + /// @brief Type for a set of instructions. + using InstructionSetTy = SmallPtrSet<Instruction *, 8>; + + /// @brief Type for non-trivially copy/recompute-able operands. + /// + /// The first element of the pair contains instructions that are actually not + /// trivially copy/recompute-able, e.g., PHI nodes and loads. The second part + /// contains all in-SCoP operands that use outside instructions. They are + /// non-trivial as we need to model the scalar read accesses of the outside + /// operands properly. + using NonTrivialOperandsPairTy = + std::pair<InstructionSetTy, OutsideOperandsSetTy>; + + /// @brief Map from in-SCoP instructions to their non-trivial operands.
+ DenseMap<Instruction *, NonTrivialOperandsPairTy> NonTrivialOperandsMap; + + /// @brief Collect non-trivial operands for all scalar write accesses. + void collectNonTrivialOperands(); + + /// @brief Remove scalar reads that can be recomputed. + /// + /// This function will check if a scalar read can be recomputed instead of + /// reloaded/communicated. If so, it will remove that access and add the + /// definition to the ScopStmt::DependentScalars of that statement instead. + /// During code generation these scalars will be recomputed before the + /// instructions in the statement are copied, so they can be used in the + /// statement without the need to communicate them through memory. + /// Additionally, this function might introduce new read accesses for operands + /// that are defined outside the SCoP in order to allow, e.g., OpenMP code + /// generation easy access to the values needed in a statement. + void removeRecomputableScalarReads(); + + /// @brief Remove scalar writes that are not read in the SCoP or at its exit. + /// + /// When Scop::removeRecomputableScalarReads() removes read accesses, the + /// corresponding write accesses might not be needed anymore. To this end, + /// this function removes scalar writes without scalar reads inside the SCoP + /// or an escaping use that would cause a reload and merge at the SCoP exit. + void removeUnreadScalarWrites(); + + /// @brief Simplify the scalar accesses in this SCoP. + /// + /// Scalar accesses are often not needed and are only caused by the placement + /// of instructions in the code. Additionally, it is sometimes possible to + /// recompute scalars to avoid communication. As scalars basically + /// sequentialize all loops they are in, we try to avoid scalar accesses as + /// much as possible. To this end we virtually move them and later recompute + /// them during code generation. This allows more freedom for the scheduler + /// while we do not need to change the original code region at all. + void simplifyScalarAccesses(); + /// @brief Build the Context of the Scop. void buildContext(); @@ -1497,6 +1588,9 @@ const_reverse_iterator rend() const { return Stmts.rend(); } //@} + /// @brief Remove the ScopArrayInfo object @p SAI. + void removeScopArrayInfo(const ScopArrayInfo *SAI); + /// @brief Return the (possibly new) ScopArrayInfo object for @p Access. /// /// @param ElementType The type of the elements stored in this array.
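To make the intent of the DependentScalars set and the simplification interfaces above concrete, here is a minimal, hypothetical C example in the spirit of the test cases added further down (the names are illustrative only and not part of this patch):

  void example(float *A) {
    for (int i = 0; i < 1000; i++) {
      float x = i * 2;  /* scalar definition in the first statement */
      /* split BB */
      A[i] = x + x;     /* scalar use in the second statement       */
    }
  }

Without this patch the definition of x is communicated to the second statement through a scalar write/read pair, which effectively sequentializes the loop. With simplifyScalarAccesses() the scalar read is removed, the defining instruction is recorded in the using statement's DependentScalars set, the now unread scalar write is dropped, and the definition is simply recomputed in front of the copied statement during code generation.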
Index: lib/Analysis/ScopInfo.cpp =================================================================== --- lib/Analysis/ScopInfo.cpp +++ lib/Analysis/ScopInfo.cpp @@ -653,6 +653,25 @@ Subscripts(Subscripts.begin(), Subscripts.end()), AccessRelation(nullptr), NewAccessRelation(nullptr) {} +MemoryAccess *MemoryAccess::copy(AccFuncSetType &AccList, + ScopStmt *Stmt) const { + AccList.emplace_back(Stmt, getAccessInstruction(), getId(), getType(), + getBaseAddr(), getElemSizeInBytes(), isAffine(), + Subscripts, Sizes, getAccessValue(), Origin, + getBaseName()); + MemoryAccess *CopyMA = &AccList.back(); + + unsigned Dims = getStatement()->getNumIterators(); + unsigned StmtDims = Stmt->getNumIterators(); + isl_map *AR = getAccessRelation(); + if (StmtDims > Dims) + AR = isl_map_add_dims(AR, isl_dim_in, StmtDims - Dims); + + CopyMA->AccessRelation = + isl_map_set_tuple_id(AR, isl_dim_in, Stmt->getDomainId()); + return CopyMA; +} + void MemoryAccess::realignParams() { isl_space *ParamSpace = Statement->getParent()->getParamSpace(); AccessRelation = isl_map_align_params(AccessRelation, ParamSpace); @@ -826,14 +845,17 @@ } } -void ScopStmt::addAccess(MemoryAccess *Access) { +void ScopStmt::addAccess(MemoryAccess *Access, bool Front) { Instruction *AccessInst = Access->getAccessInstruction(); MemoryAccessList *&MAL = InstructionToAccess[AccessInst]; if (!MAL) MAL = new MemoryAccessList(); MAL->emplace_front(Access); - MemAccs.push_back(MAL->front()); + if (Front) + MemAccs.insert(MemAccs.begin(), MAL->front()); + else + MemAccs.push_back(MAL->front()); } void ScopStmt::realignParams() { @@ -1385,31 +1407,39 @@ void ScopStmt::dump() const { print(dbgs()); } -void ScopStmt::removeMemoryAccesses(MemoryAccessList &InvMAs) { - - // Remove all memory accesses in @p InvMAs from this statement together - // with all scalar accesses that were caused by them. The tricky iteration - // order uses is needed because the MemAccs is a vector and the order in - // which the accesses of each memory access list (MAL) are stored in this - // vector is reversed. - for (MemoryAccess *MA : InvMAs) { - auto &MAL = *lookupAccessesFor(MA->getAccessInstruction()); - MAL.reverse(); - - auto MALIt = MAL.begin(); - auto MALEnd = MAL.end(); - auto MemAccsIt = MemAccs.begin(); - while (MALIt != MALEnd) { - while (*MemAccsIt != *MALIt) - MemAccsIt++; - +void ScopStmt::removeMemoryAccess(MemoryAccess *MA, bool OnlyMA) { + auto &MAL = *lookupAccessesFor(MA->getAccessInstruction()); + MAL.reverse(); + + // The tricky iteration order used is needed because MemAccs is a vector + // and the order in which the accesses of each memory access list (MAL) are + // stored in this vector is reversed. 
+ auto MALIt = MAL.begin(); + auto MALLastIt = MAL.before_begin(); + auto MALEnd = MAL.end(); + auto MemAccsIt = MemAccs.begin(); + while (true) { + + while (OnlyMA && MALIt != MALEnd && (MA != *MALIt)) { + MALLastIt++; MALIt++; - MemAccs.erase(MemAccsIt); } - InstructionToAccess.erase(MA->getAccessInstruction()); - delete &MAL; + if (MALIt == MALEnd) + break; + + while (*MemAccsIt != *MALIt) + MemAccsIt++; + + MemAccs.erase(MemAccsIt); + MALIt = MAL.erase_after(MALLastIt); } + + if (!MAL.empty()) + return; + + InstructionToAccess.erase(MA->getAccessInstruction()); + delete &MAL; } //===----------------------------------------------------------------------===// @@ -2477,6 +2507,7 @@ buildAliasChecks(AA); hoistInvariantLoads(); + simplifyScalarAccesses(); simplifySCoP(false); } @@ -2712,7 +2743,8 @@ InvMAs.reverse(); // Transfer the memory access from the statement to the SCoP. - Stmt.removeMemoryAccesses(InvMAs); + for (MemoryAccess *MA : InvMAs) + Stmt.removeMemoryAccess(MA, false); addInvariantLoads(Stmt, InvMAs); isl_set_free(Domain); @@ -2738,6 +2770,230 @@ } } +bool Scop::canRecomputeInStmt(ScopStmt &Stmt, + SmallPtrSet<Instruction *, 8> &Insts) { + if (Insts.empty()) + return true; + + // TODO: Check if we can actually move the instructions. + return false; +} + +void Scop::collectNonTrivialOperands() { + // See: Scop::simplifyScalarAccesses() + + SmallPtrSet<Instruction *, 8> Visited; + for (ScopStmt &Stmt : *this) { + for (MemoryAccess *MA : Stmt) { + if (MA->isExplicit() || MA->isRead() || MA->isPHI()) + continue; + + Instruction *AccessInst = MA->getAccessInstruction(); + if (isa<TerminatorInst>(AccessInst)) + AccessInst = cast<Instruction>(MA->getBaseAddr()); + + if (NonTrivialOperandsMap.count(AccessInst)) + continue; + + DEBUG(dbgs() << "\nCheck operand tree of '" << *AccessInst << "'\n"); + + auto &NonTrivialOperands = NonTrivialOperandsMap[AccessInst]; + auto &SideEffectOperands = NonTrivialOperands.first; + auto &OutsideOperands = NonTrivialOperands.second; + + SmallPtrSet<Instruction *, 8> Worklist; + Worklist.insert(AccessInst); + Visited.clear(); + + while (!Worklist.empty()) { + Instruction *Inst = *Worklist.begin(); + Worklist.erase(Inst); + + if (!Visited.insert(Inst).second) + continue; + + for (auto &InstOp : Inst->operands()) + if (Instruction *InstOpInst = dyn_cast<Instruction>(InstOp)) { + if (R.contains(InstOpInst)) + Worklist.insert(InstOpInst); + else + OutsideOperands.push_back(std::make_pair(InstOpInst, Inst)); + } + + if (Inst->mayHaveSideEffects() || Inst->mayReadFromMemory()) + SideEffectOperands.insert(Inst); + + if (!isa<PHINode>(Inst)) + continue; + + if (R.contains(Inst) && canSynthesize(Inst, &LI, SE, &R)) + continue; + + SideEffectOperands.insert(Inst); + } + + DEBUG({ + dbgs() << "\tSideEffectOperands: {\n"; + for (auto *Op : SideEffectOperands) + dbgs() << "\t\t" << *Op << "\n"; + dbgs() << "\t}\n"; + dbgs() << "\tOutsideOperands: {\n"; + for (const auto &OpPair : OutsideOperands) { + dbgs() << "\t\t" << *OpPair.first << "\n"; + dbgs() << "\t\tused by " << *OpPair.second << "\n"; + } + dbgs() << "\t}\n\n"; + }); + } + } +} + +void Scop::removeRecomputableScalarReads() { + // See: Scop::simplifyScalarAccesses() + + for (ScopStmt &Stmt : *this) { + BasicBlock *StmtBB = Stmt.isBlockStmt() ?
Stmt.getBasicBlock() : Stmt.getRegion()->getEntry(); + AccFuncSetType &AccList = AccFuncMap[StmtBB]; + + SmallVector<MemoryAccess *, 4> AdditionalAccesses; + SmallVector<MemoryAccess *, 4> ResolvedAccesses; + + for (MemoryAccess *MA : Stmt) { + if (MA->isExplicit() || MA->isWrite() || MA->isPHI()) + continue; + + Instruction *DefInst = dyn_cast<Instruction>(MA->getAccessValue()); + if (!DefInst) + continue; + + ScopStmt *DefStmt = getStmtForBasicBlock(DefInst->getParent()); + if (!DefStmt) + continue; + + auto &NonTrivialOperands = NonTrivialOperandsMap[DefInst]; + auto &SideEffectOperands = NonTrivialOperands.first; + if (!canRecomputeInStmt(Stmt, SideEffectOperands)) + continue; + + auto &OutsideOperands = NonTrivialOperands.second; + for (auto &OutsideOperandPair : OutsideOperands) { + Instruction *OutsideOperand = OutsideOperandPair.first; + Instruction *OutsideUser = OutsideOperandPair.second; + ScopStmt *UserStmt = getStmtForBasicBlock(OutsideUser->getParent()); + assert(UserStmt); + auto *UseMAs = UserStmt->lookupAccessesFor(OutsideUser); + if (!UseMAs) + continue; + + for (const MemoryAccess *UseMA : *UseMAs) + if (UseMA->getBaseAddr() == OutsideOperand) + AdditionalAccesses.push_back(UseMA->copy(AccList, &Stmt)); + } + + Stmt.addDependentScalar(DefInst); + ResolvedAccesses.push_back(MA); + } + + for (MemoryAccess *MA : ResolvedAccesses) + Stmt.removeMemoryAccess(MA, true); + for (MemoryAccess *MA : AdditionalAccesses) + Stmt.addAccess(MA, true); + } +} + +void Scop::removeUnreadScalarWrites() { + // See: Scop::simplifyScalarAccesses() + + for (ScopStmt &Stmt : *this) { + SmallVector<MemoryAccess *, 4> ResolvedAccesses; + for (MemoryAccess *MA : Stmt) { + if (MA->isExplicit() || MA->isRead() || MA->isPHI()) + continue; + + Instruction *AccessInst = MA->getAccessInstruction(); + if (isa<TerminatorInst>(AccessInst)) + AccessInst = cast<Instruction>(MA->getBaseAddr()); + + if (!R.contains(AccessInst)) + continue; + + bool AllUsersRemoved = true; + for (auto *User : AccessInst->users()) { + auto *UserInst = cast<Instruction>(User); + + auto *UserStmt = getStmtForBasicBlock(UserInst->getParent()); + if (!UserStmt) { + AllUsersRemoved = false; + break; + } + + auto *UserMAs = UserStmt->lookupAccessesFor(UserInst); + if (!UserMAs) + continue; + + for (auto *UserMA : *UserMAs) { + if (UserMA->isExplicit() || UserMA->isWrite()) + continue; + + AllUsersRemoved = false; + break; + } + } + + if (!AllUsersRemoved) + continue; + + ResolvedAccesses.push_back(MA); + } + + for (MemoryAccess *MA : ResolvedAccesses) { + // Remove the ScopArrayInfo object for this scalar as we neither define + // nor read it in the SCoP description but only copy the original + // version in the unoptimized region. + const ScopArrayInfo *SAI = MA->getScopArrayInfo(); + removeScopArrayInfo(SAI); + + Stmt.removeMemoryAccess(MA, true); + } + } +} + +void Scop::simplifyScalarAccesses() { + // In the first step we iterate over all implicit write accesses, hence + // scalar definitions, and collect all operands that might have side effects + // or read memory as well as all operands that are defined outside the SCoP. + // The former is needed to decide if we can recompute the scalar definition + // in another statement. The latter is needed to add read-only scalar + // accesses to the statement in which the scalar is recomputed. That allows + // us to identify values needed, e.g., for parallel code generation. + collectNonTrivialOperands(); + + // In the second step we traverse all implicit read accesses, hence scalar + // uses in statements that do not define the scalar. However, at the moment + // we exclude PHIs to simplify the logic.
For each use we will check if we can + // recompute the definition in this statement (see canRecomputeInStmt()). + // If so, we will: + // o Add the definition to the dependent scalars set of the use statement. + // o Add read accesses for all values defined prior to the SCoP if they + // were present in the definition statement. + // o Remove the use access from the use statement as it will be recomputed + // and does not need to be communicated anymore. + removeRecomputableScalarReads(); + + // In the third and final step we iterate over the scalar definitions in + // the SCoP again. We will check if we removed the accesses for all users + // of the scalar in the SCoP. If so, we can safely remove the scalar write + // access as all users will recompute the value. As we currently cannot simply + // use this logic to recompute values at the exit of the SCoP, we will not + // remove scalars that escape the SCoP. + removeUnreadScalarWrites(); +} + +void Scop::removeScopArrayInfo(const ScopArrayInfo *SAI) { + ScopArrayInfoMap.erase(std::make_pair(SAI->getBasePtr(), SAI->isPHI())); +} + const ScopArrayInfo * Scop::getOrCreateScopArrayInfo(Value *BasePtr, Type *AccessType, ArrayRef<const SCEV *> Sizes, bool IsPHI) { Index: lib/CodeGen/BlockGenerators.cpp =================================================================== --- lib/CodeGen/BlockGenerators.cpp +++ lib/CodeGen/BlockGenerators.cpp @@ -59,14 +59,33 @@ EntryBB(nullptr), PHIOpMap(PHIOpMap), ScalarMap(ScalarMap), EscapeMap(EscapeMap), GlobalMap(GlobalMap) {} +void BlockGenerator::recomputeDependentScalars( + ScopStmt &Stmt, ValueMapT &BBMap, LoopToScevMapT &LTS, + isl_id_to_ast_expr *NewAccesses) { + + for (auto *Inst : Stmt.getDependentScalars()) + if (!GlobalMap.count(Inst)) + copyInstruction(Stmt, Inst, BBMap, LTS, NewAccesses, true); +} + Value *BlockGenerator::trySynthesizeNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap, - LoopToScevMapT &LTS, - Loop *L) const { + LoopToScevMapT &LTS, Loop *L) { if (SE.isSCEVable(Old->getType())) if (const SCEV *Scev = SE.getSCEVAtScope(const_cast<Value *>(Old), L)) { if (!isa<SCEVCouldNotCompute>(Scev)) { const SCEV *NewScev = apply(Scev, LTS, SE); + + // Recompute scalars needed for this SCEV. + const Region &R = Stmt.getParent()->getRegion(); + SetVector<Value *> Values; + findValues(NewScev, Values); + for (Value *Val : Values) { + if (Instruction *Inst = dyn_cast<Instruction>(Val)) + if (R.contains(Inst)) + copyInstruction(Stmt, Inst, BBMap, LTS, nullptr, true); + } + ValueMapT VTV; VTV.insert(BBMap.begin(), BBMap.end()); VTV.insert(GlobalMap.begin(), GlobalMap.end()); @@ -80,7 +99,6 @@ "Only instructions can be insert points for SCEVExpander"); Value *Expanded = expandCodeFor(S, SE, DL, "polly", NewScev, Old->getType(), IP, &VTV); - BBMap[Old] = Expanded; return Expanded; } @@ -90,7 +108,7 @@ } Value *BlockGenerator::getNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap, - LoopToScevMapT &LTS, Loop *L) const { + LoopToScevMapT &LTS, Loop *L, bool TryOnly) { // We assume constants never change. // This avoids map lookups for many calls to this function. if (isa<Constant>(Old)) return const_cast<Value *>(Old); @@ -109,8 +127,9 @@ if (Value *New = BBMap.lookup(Old)) return New; - if (Value *New = trySynthesizeNewValue(Stmt, Old, BBMap, LTS, L)) - return New; + if (canSynthesize(Old, &LI, &SE, &Stmt.getParent()->getRegion())) + if (Value *New = trySynthesizeNewValue(Stmt, Old, BBMap, LTS, L)) + return New; // A scop-constant value defined by a global or a function parameter.
if (isa<GlobalValue>(Old) || isa<Argument>(Old)) @@ -121,29 +140,44 @@ if (!Stmt.getParent()->getRegion().contains(Inst->getParent())) return const_cast<Value *>(Old); + if (TryOnly) + return nullptr; + // The scalar dependence is neither available nor SCEVCodegenable. llvm_unreachable("Unexpected scalar dependence in region!"); - return nullptr; } void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst, - ValueMapT &BBMap, LoopToScevMapT &LTS) { + ValueMapT &BBMap, LoopToScevMapT &LTS, + bool Recompute) { // We do not generate debug intrinsics as we did not investigate how to // copy them correctly. At the current state, they just crash the code // generation as the meta-data operands are not correctly copied. if (isa<DbgInfoIntrinsic>(Inst)) return; + const Region &R = Stmt.getParent()->getRegion(); Instruction *NewInst = Inst->clone(); // Replace old operands with the new ones. for (Value *OldOperand : Inst->operands()) { - Value *NewOperand = - getNewValue(Stmt, OldOperand, BBMap, LTS, getLoopForInst(Inst)); + Value *NewOperand = getNewValue(Stmt, OldOperand, BBMap, LTS, + getLoopForStmt(Stmt), Recompute); + + if (Recompute) { + Instruction *NewOperandInst = dyn_cast_or_null<Instruction>(NewOperand); + if (!NewOperand || (NewOperandInst && R.contains(NewOperandInst))) { + if (Instruction *OldOperandInst = dyn_cast<Instruction>(OldOperand)) { + copyInstruction(Stmt, OldOperandInst, BBMap, LTS, nullptr, Recompute); + NewOperand = BBMap[OldOperand]; + } + } + } if (!NewOperand) { assert(!isa<StoreInst>(NewInst) && "Store instructions are always needed!"); + assert(!Recompute && "Recompute copy should never fail"); delete NewInst; return; } @@ -186,11 +220,13 @@ return Address; } - return getNewValue(Stmt, Pointer, BBMap, LTS, getLoopForInst(Inst)); + return getNewValue(Stmt, Pointer, BBMap, LTS, getLoopForStmt(Stmt)); } -Loop *BlockGenerator::getLoopForInst(const llvm::Instruction *Inst) { - return LI.getLoopFor(Inst->getParent()); +Loop *BlockGenerator::getLoopForStmt(const ScopStmt &Stmt) const { + auto *StmtBB = + Stmt.isBlockStmt() ? Stmt.getBasicBlock() : Stmt.getRegion()->getEntry(); + return LI.getLoopFor(StmtBB); } Value *BlockGenerator::generateScalarLoad(ScopStmt &Stmt, LoadInst *Load, @@ -219,7 +255,7 @@ Value *NewPointer = generateLocationAccessed(Stmt, Store, Pointer, BBMap, LTS, NewAccesses); Value *ValueOperand = getNewValue(Stmt, Store->getValueOperand(), BBMap, LTS, - getLoopForInst(Store)); + getLoopForStmt(Stmt)); if (DebugPrinting) RuntimeDebugBuilder::createCPUPrinter(Builder, "Store to ", NewPointer, @@ -230,13 +266,15 @@ void BlockGenerator::copyInstruction(ScopStmt &Stmt, Instruction *Inst, ValueMapT &BBMap, LoopToScevMapT &LTS, - isl_id_to_ast_expr *NewAccesses) { + isl_id_to_ast_expr *NewAccesses, + bool Recompute) { + // Terminator instructions control the control flow. They are explicitly // expressed in the clast and do not need to be copied. if (Inst->isTerminator()) return; - Loop *L = getLoopForInst(Inst); + Loop *L = getLoopForStmt(Stmt); if ((Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) && canSynthesize(Inst, &LI, &SE, &Stmt.getParent()->getRegion())) { // Synthesizable statements will be generated on-demand.
@@ -266,7 +304,7 @@ if (isIgnoredIntrinsic(Inst)) return; - copyInstScalar(Stmt, Inst, BBMap, LTS); + copyInstScalar(Stmt, Inst, BBMap, LTS, Recompute); } void BlockGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS, @@ -292,6 +330,9 @@ isl_id_to_ast_expr *NewAccesses) { BasicBlock *CopyBB = splitBB(BB); Builder.SetInsertPoint(CopyBB->begin()); + + recomputeDependentScalars(Stmt, BBMap, LTS, NewAccesses); + generateScalarLoads(Stmt, BBMap); copyBB(Stmt, BB, CopyBB, BBMap, LTS, NewAccesses); @@ -430,8 +471,8 @@ if ((Stmt.isBlockStmt() && Stmt.getBasicBlock() == ScalarValueInst->getParent()) || (Stmt.isRegionStmt() && Stmt.getRegion()->contains(ScalarValueInst))) { - auto SynthesizedValue = trySynthesizeNewValue( - Stmt, ScalarValueInst, BBMap, LTS, getLoopForInst(ScalarValueInst)); + auto SynthesizedValue = trySynthesizeNewValue(Stmt, ScalarValueInst, BBMap, + LTS, getLoopForStmt(Stmt)); if (SynthesizedValue) return SynthesizedValue; @@ -793,7 +834,7 @@ VectorValueMapT &ScalarMaps) { int VectorWidth = getVectorWidth(); Value *NewOperand = getVectorValue(Stmt, Inst->getOperand(0), VectorMap, - ScalarMaps, getLoopForInst(Inst)); + ScalarMaps, getLoopForStmt(Stmt)); assert(isa<CastInst>(Inst) && "Can not generate vector code for instruction"); @@ -805,7 +846,7 @@ void VectorBlockGenerator::copyBinaryInst(ScopStmt &Stmt, BinaryOperator *Inst, ValueMapT &VectorMap, VectorValueMapT &ScalarMaps) { - Loop *L = getLoopForInst(Inst); + Loop *L = getLoopForStmt(Stmt); Value *OpZero = Inst->getOperand(0); Value *OpOne = Inst->getOperand(1); @@ -825,7 +866,7 @@ auto *Pointer = Store->getPointerOperand(); Value *Vector = getVectorValue(Stmt, Store->getValueOperand(), VectorMap, - ScalarMaps, getLoopForInst(Store)); + ScalarMaps, getLoopForStmt(Stmt)); // Make sure we have scalar values available to access the pointer to // the data location. @@ -1212,7 +1253,7 @@ auto OldIP = Builder.GetInsertPoint(); Builder.SetInsertPoint(BBCopy->getTerminator()); - OpCopy = getNewValue(Stmt, Op, BBCopyMap, LTS, getLoopForInst(PHI)); + OpCopy = getNewValue(Stmt, Op, BBCopyMap, LTS, getLoopForStmt(Stmt)); Builder.SetInsertPoint(OldIP); } else { Index: test/Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll =================================================================== --- test/Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll +++ test/Isl/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll @@ -7,8 +7,8 @@ ; for (int i = 1; i < 1000; i++) ; A[i] += /* split bb */ A[0]; ; } -; A[0] tmp (unused) A -; CHECK: %polly.par.userContext = alloca { float, float*, float* } +; A[0] A +; CHECK: %polly.par.userContext = alloca { float, float* } ; ; CHECK: %polly.subfn.storeaddr.polly.access.A.load = getelementptr inbounds ; CHECK: store float %polly.access.A.load, float* %polly.subfn.storeaddr.polly.access.A.load Index: test/Isl/CodeGen/eliminate-multiple-scalar-fp-reads.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/eliminate-multiple-scalar-fp-reads.ll @@ -0,0 +1,90 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s +; +; SCOP-NOT: Scalar: 1 +; SCOP-NOT: ReadAccess +; +; Verify the original region is untouched but all computation is moved to the +; only place it is needed in the generated region.
+; +; CHECK: for.body.f: +; CHECK-NEXT: %idxprom = sext i32 %i.0 to i64 +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom +; CHECK-NEXT: store float %add5, float* %arrayidx +; +; CHECK: polly.stmt.for.body.f: +; CHECK: %0 = trunc i64 %polly.indvar to i32 +; CHECK: %1 = shl i32 %0, 1 +; CHECK: %p_conv = sitofp i32 %1 to float +; CHECK: %p_add = fadd float %p_conv, %p_conv +; CHECK: %p_add3 = fadd float %p_conv, %p_add +; CHECK: %p_add1 = fadd float %p_add, %p_conv +; CHECK: %p_add4 = fadd float %p_add3, %p_add1 +; CHECK: %p_add2 = fadd float %p_conv, %p_conv +; CHECK: %p_add5 = fadd float %p_add4, %p_add2 +; CHECK: %scevgep = getelementptr float, float* %A, i64 %polly.indvar +; CHECK: store float %p_add5, float* %scevgep +; +; void f(float *A) { +; for (int i = 0; i < 1000; i++) { +; float a = i * 2; +; /* split BB */ +; float b = a + a; +; /* split BB */ +; float c = b + a; +; /* split BB */ +; float d = a + a; +; /* split BB */ +; float e = a + b + c + d; +; /* split BB */ +; A[i] = e; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 1000 + br i1 %cmp, label %for.body.a, label %for.end + +for.body.a: + %mul = mul nsw i32 %i.0, 2 + %conv = sitofp i32 %mul to float + br label %for.body.b + +for.body.b: + %add = fadd float %conv, %conv + br label %for.body.c + +for.body.c: + %add1 = fadd float %add, %conv + br label %for.body.d + +for.body.d: + %add2 = fadd float %conv, %conv + br label %for.body.e + +for.body.e: + %add3 = fadd float %conv, %add + %add4 = fadd float %add3, %add1 + %add5 = fadd float %add4, %add2 + br label %for.body.f + +for.body.f: + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds float, float* %A, i64 %idxprom + store float %add5, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/CodeGen/eliminate-multiple-scalar-reads.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/eliminate-multiple-scalar-reads.ll @@ -0,0 +1,82 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s +; +; SCOP-NOT: Scalar: 1 +; SCOP-NOT: ReadAccess +; +; Verify the original region is untouched but all computation is moved to the +; only place it is needed in the generated region. 
+; +; CHECK: for.body.f: +; CHECK-NEXT: %idxprom6 = sext i32 %i.0 to i64 +; CHECK-NEXT: %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %idxprom6 +; CHECK-NEXT: store i32 %add5, i32* %arrayidx7, align 4 +; +; CHECK: polly.stmt.for.body.f: +; CHECK: %scevgep = getelementptr i32, i32* %A, i64 %polly.indvar +; CHECK: %0 = trunc i64 %polly.indvar to i32 +; CHECK: %1 = shl i32 %0, 4 +; CHECK: store i32 %1, i32* %scevgep +; +; void f(int *A) { +; for (int i = 0; i < 1000; i++) { +; int a = i * 2; +; /* split BB */ +; int b = a + a; +; /* split BB */ +; int c = b + a; +; /* split BB */ +; int d = a + a; +; /* split BB */ +; int e = a + b + c + d; +; /* split BB */ +; A[i] = e; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 1000 + br i1 %cmp, label %for.body.a, label %for.end + +for.body.a: ; preds = %for.cond + %tmp = mul nsw i32 %i.0, 2 + br label %for.body.b + +for.body.b: + %add = add nsw i32 %tmp, %tmp + br label %for.body.c + +for.body.c: + %add1 = add nsw i32 %add, %tmp + br label %for.body.d + +for.body.d: + %add2 = add nsw i32 %tmp, %tmp + br label %for.body.e + +for.body.e: + %add3 = add nsw i32 %tmp, %add + %add4 = add nsw i32 %add3, %add1 + %add5 = add nsw i32 %add4, %add2 + br label %for.body.f + +for.body.f: + %idxprom6 = sext i32 %i.0 to i64 + %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %idxprom6 + store i32 %add5, i32* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/CodeGen/eliminate-scalars-with-outside-load.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/eliminate-scalars-with-outside-load.ll @@ -0,0 +1,61 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; +; Verify that we will virtually move %mul but also the read of %tmp to the +; for.body.split block. 
+; +; CHECK: Stmt_for_body_split +; CHECK-NOT: MemRef_mul +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] +; CHECK: { Stmt_for_body_split[i0] -> MemRef_tmp[] }; +; CHECK-NOT: MemRef_mul +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: { Stmt_for_body_split[i0] -> MemRef_A[i0] }; +; CHECK-NOT: MemRef_mul +; +; CODEGEN: polly.stmt.for.body.split: +; CODEGEN-NEXT: %p_mul = fmul float %tmp, 2.000000e+00 +; // Unused: +; CODEGEN-NEXT: %tmp.s2a.reload = load float, float* %tmp.s2a +; +; CODEGEN-NEXT: %scevgep = getelementptr float, float* %A, i64 %polly.indvar +; CODEGEN-NEXT: store float %p_mul, +; +; void f(float *A) { +; float x = A[-1]; +; for (int i = 0; i < 1000; i++) { +; float a = x * 2; +; /* split BB */ +; A[i] = a; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(float* %A) { +entry: + %arrayidx = getelementptr inbounds float, float* %A, i64 -1 + %tmp = load float, float* %arrayidx, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = fmul float %tmp, 2.000000e+00 + br label %for.body.split + +for.body.split: ; preds = %for.cond + %arrayidx1 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %mul, float* %arrayidx1, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/CodeGen/srem-in-other-bb.ll =================================================================== --- test/Isl/CodeGen/srem-in-other-bb.ll +++ test/Isl/CodeGen/srem-in-other-bb.ll @@ -1,18 +1,13 @@ -; RUN: opt %loadPolly -polly-codegen -S \ -; RUN: < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s ; ; void pos(float *A, long n) { ; for (long i = 0; i < 100; i++) ; A[n % 42] += 1; ; } ; -; CHECK: polly.stmt.bb2: -; CHECK-NEXT: %p_tmp = srem i64 %n, 42 -; CHECK-NEXT: store i64 %p_tmp, i64* %tmp.s2a -; -; CHECK: polly.stmt.bb3: -; CHECK: %tmp.s2a.reload = load i64, i64* %tmp.s2a -; CHECK: %p_tmp3 = getelementptr inbounds float, float* %A, i64 %tmp.s2a.reload +; CHECK: polly.stmt.bb3: +; CHECK: %[[rem:[._a-zA-Z0-9]*]] = srem i64 %n, 42 +; CHECK: getelementptr inbounds float, float* %A, i64 %[[rem]] define void @pos(float* %A, i64 %n) { bb: Index: test/ScopInfo/eliminate-scalar-caused-by-load-reduction-2.ll =================================================================== --- /dev/null +++ test/ScopInfo/eliminate-scalar-caused-by-load-reduction-2.ll @@ -0,0 +1,56 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; +; This is a negative test. We should move the load to the split block +; and remove all scalar accesses, however at the moment we only move +; instructions that are trivially safe to move. All three checks should +; be negated at some point. This also checks that we currently not try to +; move a part of the scalar operand chain, i.e., the %add instruction. 
+; +; CHECK: Scalar: 1 +; CHECK: Scalar: 1 +; CHECK-NOT: Reduction: + +; +; These checks should stay as they verify we did not modify the original region: +; +; CODEGEN: for.body.split: +; CODEGEN-NEXT: %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv +; CODEGEN-NEXT: store i32 %add, i32* %arrayidx2, align 4 +; +; void f(int *A) { +; for (int i = 0; i < 1000; i++) { +; int x = A[i] + 3; +; /* split BB */ +; A[i] = x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 3 + br label %for.body.split + +for.body.split: + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/ScopInfo/eliminate-scalar-caused-by-load-reduction.ll =================================================================== --- /dev/null +++ test/ScopInfo/eliminate-scalar-caused-by-load-reduction.ll @@ -0,0 +1,48 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; +; This is a negative test. We should move the load to the split block +; and remove all scalar accesses, however at the moment we only move +; instructions that are trivially safe to move. All three checks should +; be negated at some point. 
+; +; CHECK: Scalar: 1 +; CHECK: Scalar: 1 +; CHECK-NOT: Reduction: + +; +; void f(int *A) { +; for (int i = 0; i < 1000; i++) { +; int x = A[i]; +; /* split BB */ +; A[i] = x + 3; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %exitcond = icmp ne i64 %indvars.iv, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + br label %for.body.split + +for.body.split: + %add = add nsw i32 %tmp, 3 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/ScopInfo/inter_bb_scalar_dep.ll =================================================================== --- test/ScopInfo/inter_bb_scalar_dep.ll +++ test/ScopInfo/inter_bb_scalar_dep.ll @@ -1,4 +1,5 @@ ; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -basicaa -polly-codegen -analyze < %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; @@ -36,10 +37,11 @@ %indvar.j = phi i64 [ 0, %entry.next ], [ %indvar.j.next, %for.j ] %init_plus_two = add i64 %init, 2 ; CHECK-LABEL: Stmt_for_j -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] +; CHECK-NOT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init_plus_two, i64* %scevgep %indvar.j.next = add nsw i64 %indvar.j, 1 Index: test/ScopInfo/intra_and_inter_bb_scalar_dep.ll =================================================================== --- test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -39,10 +39,10 @@ %init_2 = load i64, i64* %init_ptr %init_sum = add i64 %init, %init_2 ; CHECK: Stmt_for_j -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; -; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK-NOT: [Scalar: 1] +; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK-NOT: [Scalar: 1] %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init_sum, i64* %scevgep %indvar.j.next = add nsw i64 %indvar.j, 1 Index: test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll =================================================================== --- test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll +++ test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] }; +; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp7[] }; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: 
test/ScopInfo/scalar_dependence_cond_br.ll =================================================================== --- test/ScopInfo/scalar_dependence_cond_br.ll +++ test/ScopInfo/scalar_dependence_cond_br.ll @@ -6,13 +6,11 @@ ; A[i]++; ; } ; -; FIXME: This test is a negative test until we have an independent blocks alternative. -; ; We should move operands as close to their use as possible, hence in this case -; there should not be any scalar dependence anymore after %cmp1 is moved to -; %for.body (%c and %indvar.iv are synthesis able). +; there should not be any scalar dependence anymore after %cmp1 is virtually +; moved to %for.body (%c and %indvar.iv are synthesis able). ; -; CHECK: [Scalar: 1] +; CHECK-NOT: [Scalar: 1] ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" Index: test/ScopInfo/tempscop-printing.ll =================================================================== --- test/ScopInfo/tempscop-printing.ll +++ test/ScopInfo/tempscop-printing.ll @@ -1,4 +1,5 @@ ; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -basicaa -polly-codegen -analyze < %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; @@ -35,10 +36,11 @@ for.j: %indvar.j = phi i64 [ 0, %entry.next ], [ %indvar.j.next, %for.j ] ; CHECK: Stmt_for_j -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] +; CHECK-NOT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] %init_plus_two = add i64 %init, 2 %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init_plus_two, i64* %scevgep @@ -77,10 +79,11 @@ %indvar.j = phi i64 [ 0, %entry.next ], [ %indvar.j.next, %for.j ] %scevgep = getelementptr i64, i64* %A, i64 %indvar.j store i64 %init, i64* %scevgep -; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 1] -; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] +; CHECK-NOT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_init[] }; ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_j[i0, i1] -> MemRef_A[i1] }; +; CHECK-NOT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] %indvar.j.next = add nsw i64 %indvar.j, 1 %exitcond.j = icmp eq i64 %indvar.j.next, %N br i1 %exitcond.j, label %for.i.end, label %for.j Index: test/ScopInfo/unneeded_scalar_dependences-1.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-1.ll @@ -0,0 +1,63 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; CHECK-NOT: Scalar: 1 +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; int x = i + 1; +; for (int j = 0; j < N; j++) +; A[i] += A[j]; +; A[i] += x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc.10, %entry + %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc.10 ], [ 0, %entry ] + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1 + %tmp6 = trunc i64 %indvars.iv.next2 to i32 + %cmp = icmp slt i64 %indvars.iv1, %tmp + br i1 %cmp, label %for.body, 
label %for.end.12 + +for.body: ; preds = %for.cond + br label %for.cond.1 + +for.cond.1: ; preds = %for.inc, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.body.3, label %for.end + +for.body.3: ; preds = %for.cond.1 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp3 = load i32, i32* %arrayidx, align 4 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv1 + %tmp4 = load i32, i32* %arrayidx5, align 4 + %add6 = add nsw i32 %tmp4, %tmp3 + store i32 %add6, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond.1 + +for.end: ; preds = %for.cond.1 + %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv1 + %tmp5 = load i32, i32* %arrayidx8, align 4 + %add9 = add nsw i32 %tmp5, %tmp6 + store i32 %add9, i32* %arrayidx8, align 4 + br label %for.inc.10 + +for.inc.10: ; preds = %for.end + br label %for.cond + +for.end.12: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-2.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-2.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; CHECK-NOT: Scalar: 1 +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; int x = i + 3; +; for (int j = 0; j < N; j++) +; A[i] += A[x + j]; +; A[x] += x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc.11, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc.11 ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv2, %tmp + br i1 %cmp, label %for.body, label %for.end.13 + +for.body: ; preds = %for.cond + %tmp5 = add nuw nsw i64 %indvars.iv2, 3 + br label %for.cond.1 + +for.cond.1: ; preds = %for.inc, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.body.3, label %for.end + +for.body.3: ; preds = %for.cond.1 + %tmp6 = add nuw nsw i64 %tmp5, %indvars.iv + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %tmp6 + %tmp7 = load i32, i32* %arrayidx, align 4 + %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv2 + %tmp8 = load i32, i32* %arrayidx6, align 4 + %add7 = add nsw i32 %tmp8, %tmp7 + store i32 %add7, i32* %arrayidx6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond.1 + +for.end: ; preds = %for.cond.1 + %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 %tmp5 + %tmp9 = load i32, i32* %arrayidx9, align 4 + %tmp10 = trunc i64 %tmp5 to i32 + %add10 = add nsw i32 %tmp9, %tmp10 + store i32 %add10, i32* %arrayidx9, align 4 + br label %for.inc.11 + +for.inc.11: ; preds = %for.end + %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1 + br label %for.cond + +for.end.13: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-3.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-3.ll @@ -0,0 +1,66 @@ +; RUN: opt 
%loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; CHECK-NOT: Scalar: 1 +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; int x = i + 3; +; for (int j = 0; j < N; j++) +; A[i] += x + j * x; +; A[x] += x * x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc.10, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.10 ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end.12 + +for.body: ; preds = %for.cond + %tmp2 = add nuw nsw i64 %indvars.iv, 3 + br label %for.cond.1 + +for.cond.1: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body.3, label %for.end + +for.body.3: ; preds = %for.cond.1 + %tmp3 = trunc i64 %tmp2 to i32 + %mul = mul nsw i32 %j.0, %tmp3 + %tmp4 = trunc i64 %tmp2 to i32 + %add4 = add nsw i32 %tmp4, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp5 = load i32, i32* %arrayidx, align 4 + %add5 = add nsw i32 %tmp5, %add4 + store i32 %add5, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %inc = add nuw nsw i32 %j.0, 1 + br label %for.cond.1 + +for.end: ; preds = %for.cond.1 + %tmp6 = trunc i64 %tmp2 to i32 + %mul6 = mul nsw i32 %tmp6, %tmp6 + %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %tmp2 + %tmp7 = load i32, i32* %arrayidx8, align 4 + %add9 = add nsw i32 %tmp7, %mul6 + store i32 %add9, i32* %arrayidx8, align 4 + br label %for.inc.10 + +for.inc.10: ; preds = %for.end + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end.12: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-4.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-4.ll @@ -0,0 +1,69 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; CHECK-NOT: MemRef_cond +; CHECK-NOT: Scalar: 1 +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; int x = i > 42 ? 
i - 3 : i; +; for (int j = 0; j < N; j++) +; A[i] += x + j * x; +; A[i] += x * x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc.10, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.10 ], [ 0, %entry ] + %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc.10 ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end.12 + +for.body: ; preds = %for.cond + %cmp1 = icmp sgt i64 %indvars.iv, 42 + %tmp2 = trunc i64 %indvars.iv to i32 + %sub = add nsw i32 %i.0, -3 + %cond = select i1 %cmp1, i32 %sub, i32 %tmp2 + br label %for.cond.2 + +for.cond.2: ; preds = %for.inc, %cond.end + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %cmp3 = icmp slt i32 %j.0, %N + br i1 %cmp3, label %for.body.4, label %for.end + +for.body.4: ; preds = %for.cond.2 + %mul = mul nsw i32 %j.0, %cond + %add = add nsw i32 %cond, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp4 = load i32, i32* %arrayidx, align 4 + %add5 = add nsw i32 %tmp4, %add + store i32 %add5, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.4 + %inc = add nuw nsw i32 %j.0, 1 + br label %for.cond.2 + +for.end: ; preds = %for.cond.2 + %mul6 = mul nsw i32 %cond, %cond + %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp5 = load i32, i32* %arrayidx8, align 4 + %add9 = add nsw i32 %tmp5, %mul6 + store i32 %add9, i32* %arrayidx8, align 4 + br label %for.inc.10 + +for.inc.10: ; preds = %for.end + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %inc11 = add nuw nsw i32 %i.0, 1 + br label %for.cond + +for.end.12: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-5.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-5.ll @@ -0,0 +1,72 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; CHECK-NOT: MemRef_conv +; CHECK-NOT: Scalar: 1 +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; float x = i + 3.3; +; for (int j = 0; j < N; j++) +; A[i] += x + j * x; +; A[i] += x * x; +; } +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc.17, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.17 ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end.19 + +for.body: ; preds = %for.cond + %tmp1 = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp1 to double + %add = fadd double %conv, 3.300000e+00 + %conv1 = fptrunc double %add to float + br label %for.cond.2 + +for.cond.2: ; preds = %for.inc, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %j.0, %N + br i1 %exitcond, label %for.body.5, label %for.end + +for.body.5: ; preds = %for.cond.2 + %conv6 = sitofp i32 %j.0 to float + %mul = fmul float %conv6, %conv1 + %add7 = fadd float %conv1, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp2 = load i32, i32* %arrayidx, align 4 + %conv8 = sitofp i32 %tmp2 to float + %add9 = fadd float %conv8, %add7 + %conv10 = fptosi float %add9 to i32 + store i32 %conv10, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.5 + %inc = 
add nuw nsw i32 %j.0, 1 + br label %for.cond.2 + +for.end: ; preds = %for.cond.2 + %mul11 = fmul float %conv1, %conv1 + %arrayidx13 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp3 = load i32, i32* %arrayidx13, align 4 + %conv14 = sitofp i32 %tmp3 to float + %add15 = fadd float %conv14, %mul11 + %conv16 = fptosi float %add15 to i32 + store i32 %conv16, i32* %arrayidx13, align 4 + br label %for.inc.17 + +for.inc.17: ; preds = %for.end + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end.19: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-6.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-6.ll @@ -0,0 +1,49 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; void f(int *A, int N) { +; for (int i = 1; i < N; i++) { +; int preloaded = A[0]; +; /* split BB */ +; A[i] += preloaded; +; } +; } +; +; CHECK: Invariant Accesses: { +; CHECK: ReadAccess := [Reduction Type: NONE] [Scalar: 0] +; CHECK: [N] -> { Stmt_for_body[i0] -> MemRef_A[0] }; +; CHECK: Execution Context: [N] -> { : N >= 2 } +; CHECK: } +; +; CHECK-NOT: Scalar: 1 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 1, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp1 = load i32, i32* %A, align 4 + br label %for.body.split + +for.body.split: + %arrayidx1 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp2, %tmp1 + store i32 %add, i32* %arrayidx1, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-7.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-7.ll @@ -0,0 +1,47 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; void f(int *A, int N) { +; for (int i = 0; i < N; i++) { +; int scevable_addr = 2 * i + N; +; /* split BB */ +; A[scevable_addr] += i; +; } +; } +; +; CHECK-NOT: Scalar: 1 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp1 = trunc i64 %indvars.iv to i32 + %mul = shl nsw i32 %tmp1, 1 + %add = add nsw i32 %mul, %N + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom + br label %for.body.split + +for.body.split: + %tmp2 = load i32, i32* %arrayidx, align 4 + %tmp3 = trunc i64 %indvars.iv to i32 + %add1 = add nsw i32 %tmp2, %tmp3 + store i32 %add1, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret 
void +} Index: test/ScopInfo/unneeded_scalar_dependences-8.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-8.ll @@ -0,0 +1,51 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; void f(int *A, int N) { +; int pre_scevable_addr = 2 * N + (N - 1); +; for (int i = 0; i < N; i++) { +; int scevable_addr = 2 * i + pre_scevable_addr; +; /* split BB */ +; A[scevable_addr] += i; +; } +; } +; +; CHECK-NOT: Scalar: 1 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + %m = mul nsw i32 %N, 2 + %s = sub nsw i32 %N, 1 + %pre_scevable_addr = add nsw i32 %m, %s + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp1 = trunc i64 %indvars.iv to i32 + %mul = shl nsw i32 %tmp1, 1 + %add = add nsw i32 %mul, %pre_scevable_addr + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom + br label %for.body.split + +for.body.split: + %tmp2 = load i32, i32* %arrayidx, align 4 + %tmp3 = trunc i64 %indvars.iv to i32 + %add1 = add nsw i32 %tmp2, %tmp3 + store i32 %add1, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/ScopInfo/unneeded_scalar_dependences-9.ll =================================================================== --- /dev/null +++ test/ScopInfo/unneeded_scalar_dependences-9.ll @@ -0,0 +1,60 @@ +; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -analyze < %s +; +; void f(int *A, int N) { +; if (N > 42) { +; int pre_scevable_addr = 2 * N + (N - 1); +; for (int i = 0; i < N; i++) { +; int scevable_addr = 2 * i + pre_scevable_addr; +; /* split BB */ +; A[scevable_addr] += i; +; } +; } +; } +; +; CHECK-NOT: Scalar: 1 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @f(i32* %A, i32 %N) { +entry: + %tmp = sext i32 %N to i64 + %m = mul nsw i32 %N, 2 + br label %if.cond + +if.cond: + %s = sub nsw i32 %N, 1 + %c = icmp sgt i32 %N, 42 + br i1 %c, label %if.then, label %for.end + +if.then: + %pre_scevable_addr = add nsw i32 %m, %s + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %if.then ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp1 = trunc i64 %indvars.iv to i32 + %mul = shl nsw i32 %tmp1, 1 + %add = add nsw i32 %mul, %pre_scevable_addr + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom + br label %for.body.split + +for.body.split: + %tmp2 = load i32, i32* %arrayidx, align 4 + %tmp3 = trunc i64 %indvars.iv to i32 + %add1 = add nsw i32 %tmp2, %tmp3 + store i32 %add1, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +}