Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -242,6 +242,7 @@
 void initializeLoopInfoWrapperPassPass(PassRegistry&);
 void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
 void initializeLoopInterchangePass(PassRegistry&);
+void initializeLoopFlattenLegacyPassPass(PassRegistry&);
 void initializeLoopLoadEliminationPass(PassRegistry&);
 void initializeLoopPassPass(PassRegistry&);
 void initializeLoopPredicationLegacyPassPass(PassRegistry&);
Index: llvm/include/llvm/LinkAllPasses.h
===================================================================
--- llvm/include/llvm/LinkAllPasses.h
+++ llvm/include/llvm/LinkAllPasses.h
@@ -127,6 +127,7 @@
       (void) llvm::createLazyValueInfoPass();
       (void) llvm::createLoopExtractorPass();
       (void) llvm::createLoopInterchangePass();
+      (void) llvm::createLoopFlattenPass();
       (void) llvm::createLoopPredicationPass();
       (void) llvm::createLoopSimplifyPass();
       (void) llvm::createLoopSimplifyCFGPass();
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -149,6 +149,12 @@
 //
 Pass *createLoopInterchangePass();
 
+//===----------------------------------------------------------------------===//
+//
+// LoopFlatten - This pass flattens nested loops into a single loop.
+//
+Pass *createLoopFlattenPass();
+
 //===----------------------------------------------------------------------===//
 //
 // LoopStrengthReduce - This pass is strength reduces GEP instructions that use
Index: llvm/include/llvm/Transforms/Scalar/LoopFlatten.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/Transforms/Scalar/LoopFlatten.h
@@ -0,0 +1,33 @@
+//===- LoopFlatten.h - Loop Flatten ---------------- -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the Loop Flatten Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
+#define LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+class LoopFlattenPass : public PassInfoMixin<LoopFlattenPass> {
+public:
+  LoopFlattenPass() = default;
+
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -149,6 +149,7 @@
 #include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
 #include "llvm/Transforms/Scalar/LoopDeletion.h"
 #include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
 #include "llvm/Transforms/Scalar/LoopFuse.h"
 #include
"llvm/Transforms/Scalar/LoopIdiomRecognize.h"
 #include "llvm/Transforms/Scalar/LoopInstSimplify.h"
@@ -249,6 +250,10 @@
     "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
     cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
 
+static cl::opt<bool> EnableLoopFlatten(
+    "enable-npm-loop-flatten", cl::init(false), cl::Hidden,
+    cl::desc("Enable the Loop flattening pass for the new PM (default = off)"));
+
 static cl::opt<bool> EnableSyntheticCounts(
     "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Run synthetic function entry count generation "
@@ -509,6 +514,8 @@
         C(LPM2, Level);
 
   LPM2.addPass(LoopDeletionPass());
+  if (EnableLoopFlatten)
+    LPM2.addPass(LoopFlattenPass());
   // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
   // because it changes IR to makes profile annotation in back compile
   // inaccurate. The normal unroller doesn't pay attention to forced full unroll
Index: llvm/lib/Passes/PassRegistry.def
===================================================================
--- llvm/lib/Passes/PassRegistry.def
+++ llvm/lib/Passes/PassRegistry.def
@@ -358,6 +358,7 @@
 LOOP_PASS("no-op-loop", NoOpLoopPass())
 LOOP_PASS("print", PrintLoopPass(dbgs()))
 LOOP_PASS("loop-deletion", LoopDeletionPass())
+LOOP_PASS("loop-flatten", LoopFlattenPass())
 LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
 LOOP_PASS("loop-reduce", LoopStrengthReducePass())
 LOOP_PASS("indvars", IndVarSimplifyPass())
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -92,6 +92,10 @@
     cl::init(false), cl::Hidden,
     cl::desc("Enable Unroll And Jam Pass"));
 
+static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
+                                       cl::Hidden,
+                                       cl::desc("Enable the LoopFlatten Pass"));
+
 static cl::opt<bool> EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false),
cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -444,6 +448,10 @@ if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops + if (EnableLoopFlatten) { + MPM.add(createLoopFlattenPass()); // Flatten loops + MPM.add(createLoopSimplifyCFGPass()); + } // Unroll small loops MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, @@ -1035,6 +1043,8 @@ PM.add(createLoopDeletionPass()); if (EnableLoopInterchange) PM.add(createLoopInterchangePass()); + if (EnableLoopFlatten) + PM.add(createLoopFlattenPass()); // Unroll small loops PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -32,6 +32,7 @@ LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp + LoopFlatten.cpp LoopLoadElimination.cpp LoopPassManager.cpp LoopPredication.cpp Index: llvm/lib/Transforms/Scalar/LoopFlatten.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -0,0 +1,604 @@ +//===- LoopFlatten.cpp - Loop flattening pass------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass flattens pairs nested loops into a single loop. 
+//
+// The intention is to optimise loop nests like this, which together access an
+// array linearly:
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       f(A[i*M+j]);
+// into one loop:
+//   for (int i = 0; i < (N*M); ++i)
+//     f(A[i]);
+//
+// It can also flatten loops where the induction variables are not used in the
+// loop. This is only worth doing if the induction variables are only used in an
+// expression like i*M+j. If they had any other uses, we would have to insert a
+// div/mod to reconstruct the original values, so this wouldn't be profitable.
+//
+// We also need to prove that N*M will not overflow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-flatten"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> RepeatedInstructionThreshold(
+    "loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
+    cl::desc("Limit on the cost of instructions that can be repeated due to "
+             "loop flattening"));
+
+static cl::opt<bool>
+    AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden,
+                     cl::init(false),
+                     cl::desc("Assume that the product of the two iteration "
+                              "limits will never overflow"));
+
+// Finds the induction variable,
increment and limit for a simple loop that we
+// can flatten.
+static bool findLoopComponents(
+    Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
+    PHINode *&InductionPHI, Value *&Limit, BinaryOperator *&Increment,
+    BranchInst *&BackBranch, ScalarEvolution *SE) {
+  LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
+
+  if (!L->isLoopSimplifyForm()) {
+    LLVM_DEBUG(dbgs() << "Loop is not in normal form\n");
+    return false;
+  }
+
+  // There must be exactly one exiting block, and it must be the same at the
+  // latch.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (L->getExitingBlock() != Latch) {
+    LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n");
+    return false;
+  }
+  // Latch block must end in a conditional branch.
+  BackBranch = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!BackBranch || !BackBranch->isConditional()) {
+    LLVM_DEBUG(dbgs() << "Could not find back-branch\n");
+    return false;
+  }
+  IterationInstructions.insert(BackBranch);
+  LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
+  bool ContinueOnTrue = L->contains(BackBranch->getSuccessor(0));
+
+  // Find the induction PHI. If there is no induction PHI, we can't do the
+  // transformation. TODO: could other variables trigger this? Do we have to
+  // search for the best one?
+  InductionPHI = nullptr;
+  for (PHINode &PHI : L->getHeader()->phis()) {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) {
+      InductionPHI = &PHI;
+      LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump());
+      break;
+    }
+  }
+  if (!InductionPHI) {
+    LLVM_DEBUG(dbgs() << "Could not find induction PHI\n");
+    return false;
+  }
+
+  auto IsValidPredicate = [&](ICmpInst::Predicate Pred) {
+    if (ContinueOnTrue)
+      return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT;
+    else
+      return Pred == CmpInst::ICMP_EQ;
+  };
+
+  // Find Compare and make sure it is valid
+  ICmpInst *Compare = dyn_cast<ICmpInst>(BackBranch->getCondition());
+  if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) ||
+      Compare->hasNUsesOrMore(2)) {
+    LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
+    return false;
+  }
+  IterationInstructions.insert(Compare);
+  LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump());
+
+  // Find increment and limit from the compare
+  Increment = nullptr;
+  if (match(Compare->getOperand(0),
+            m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+    Increment = dyn_cast<BinaryOperator>(Compare->getOperand(0));
+    Limit = Compare->getOperand(1);
+  } else if (Compare->getUnsignedPredicate() == CmpInst::ICMP_NE &&
+             match(Compare->getOperand(1),
+                   m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+    Increment = dyn_cast<BinaryOperator>(Compare->getOperand(1));
+    Limit = Compare->getOperand(0);
+  }
+  if (!Increment || Increment->hasNUsesOrMore(3)) {
+    LLVM_DEBUG(dbgs() << "Could not find valid increment\n");
+    return false;
+  }
+  IterationInstructions.insert(Increment);
+  LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
+  LLVM_DEBUG(dbgs() << "Found limit: "; Limit->dump());
+
+  assert(InductionPHI->getNumIncomingValues() == 2);
+  assert(InductionPHI->getIncomingValueForBlock(Latch) == Increment &&
+         "PHI value is not increment inst");
+
+  auto *CI = dyn_cast<ConstantInt>(
InductionPHI->getIncomingValueForBlock(L->getLoopPreheader())); + if (!CI || !CI->isZero()) { + LLVM_DEBUG(dbgs() << "PHI value is not zero: "; CI->dump()); + return false; + } + + LLVM_DEBUG(dbgs() << "Successfully found all loop components\n"); + return true; +} + +static bool checkPHIs(Loop *OuterLoop, Loop *InnerLoop, + SmallPtrSetImpl &InnerPHIsToTransform, + PHINode *InnerInductionPHI, PHINode *OuterInductionPHI, + TargetTransformInfo *TTI) { + // All PHIs in the inner and outer headers must either be: + // - The induction PHI, which we are going to rewrite as one induction in + // the new loop. This is already checked by findLoopComponents. + // - An outer header PHI with all incoming values from outside the loop. + // LoopSimplify guarantees we have a pre-header, so we don't need to + // worry about that here. + // - Pairs of PHIs in the inner and outer headers, which implement a + // loop-carried dependency that will still be valid in the new loop. To + // be valid, this variable must be modified only in the inner loop. + + // The set of PHI nodes in the outer loop header that we know will still be + // valid after the transformation. These will not need to be modified (with + // the exception of the induction variable), but we do need to check that + // there are no unsafe PHI nodes. + SmallPtrSet SafeOuterPHIs; + SafeOuterPHIs.insert(OuterInductionPHI); + + // Check that all PHI nodes in the inner loop header match one of the valid + // patterns. + for (PHINode &InnerPHI : InnerLoop->getHeader()->phis()) { + // The induction PHIs break these rules, and that's OK because we treat + // them specially when doing the transformation. + if (&InnerPHI == InnerInductionPHI) + continue; + + // Each inner loop PHI node must have two incoming values/blocks - one + // from the pre-header, and one from the latch. 
+    assert(InnerPHI.getNumIncomingValues() == 2);
+    Value *PreHeaderValue =
+        InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopPreheader());
+    Value *LatchValue =
+        InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopLatch());
+
+    // The incoming value from the outer loop must be the PHI node in the
+    // outer loop header, with no modifications made in the top of the outer
+    // loop.
+    PHINode *OuterPHI = dyn_cast<PHINode>(PreHeaderValue);
+    if (!OuterPHI || OuterPHI->getParent() != OuterLoop->getHeader()) {
+      LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n");
+      return false;
+    }
+
+    // The other incoming value must come from the inner loop, without any
+    // modifications in the tail end of the outer loop. We are in LCSSA form,
+    // so this will actually be a PHI in the inner loop's exit block, which
+    // only uses values from inside the inner loop.
+    PHINode *LCSSAPHI = dyn_cast<PHINode>(
+        OuterPHI->getIncomingValueForBlock(OuterLoop->getLoopLatch()));
+    if (!LCSSAPHI) {
+      LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n");
+      return false;
+    }
+
+    // The value used by the LCSSA PHI must be the same one that the inner
+    // loop's PHI uses.
+    if (LCSSAPHI->hasConstantValue() != LatchValue) {
+      LLVM_DEBUG(
+          dbgs() << "LCSSA PHI incoming value does not match latch value\n");
+      return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "PHI pair is safe:\n");
+    LLVM_DEBUG(dbgs() << "  Inner: "; InnerPHI.dump());
+    LLVM_DEBUG(dbgs() << "  Outer: "; OuterPHI->dump());
+    SafeOuterPHIs.insert(OuterPHI);
+    InnerPHIsToTransform.insert(&InnerPHI);
+  }
+
+  for (PHINode &OuterPHI : OuterLoop->getHeader()->phis()) {
+    if (!SafeOuterPHIs.count(&OuterPHI)) {
+      LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static bool
+checkOuterLoopInsts(Loop *OuterLoop, Loop *InnerLoop,
+                    SmallPtrSetImpl<Instruction *> &IterationInstructions,
+                    Value *InnerLimit, PHINode *OuterPHI,
+                    TargetTransformInfo *TTI) {
+  // Check for instructions in the outer but not inner loop. If any of these
+  // have side-effects then this transformation is not legal, and if there is
+  // a significant amount of code here which can't be optimised out that it's
+  // not profitable (as these instructions would get executed for each
+  // iteration of the inner loop).
+  unsigned RepeatedInstrCost = 0;
+  for (auto *B : OuterLoop->getBlocks()) {
+    if (InnerLoop->contains(B))
+      continue;
+
+    for (auto &I : *B) {
+      if (!isa<PHINode>(&I) && !I.isTerminator() &&
+          !isSafeToSpeculativelyExecute(&I)) {
+        LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have "
+                             "side effects: ";
+                   I.dump());
+        return false;
+      }
+      // The execution count of the outer loop's iteration instructions
+      // (increment, compare and branch) will be increased, but the
+      // equivalent instructions will be removed from the inner loop, so
+      // they make a net difference of zero.
+      if (IterationInstructions.count(&I))
+        continue;
+      // The unconditional branch to the inner loop's header will turn into
+      // a fall-through, so adds no cost.
+      BranchInst *Br = dyn_cast<BranchInst>(&I);
+      if (Br && Br->isUnconditional() &&
+          Br->getSuccessor(0) == InnerLoop->getHeader())
+        continue;
+      // Multiplies of the outer iteration variable and inner iteration
+      // count will be optimised out.
+      if (match(&I, m_c_Mul(m_Specific(OuterPHI), m_Specific(InnerLimit))))
+        continue;
+      int Cost = TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+      LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
+      RepeatedInstrCost += Cost;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: "
+                    << RepeatedInstrCost << "\n");
+  // Bail out if flattening the loops would cause instructions in the outer
+  // loop but not in the inner loop to be executed extra times.
+  if (RepeatedInstrCost > RepeatedInstructionThreshold)
+    return false;
+
+  return true;
+}
+
+static bool checkIVUsers(PHINode *InnerPHI, PHINode *OuterPHI,
+                         BinaryOperator *InnerIncrement,
+                         BinaryOperator *OuterIncrement, Value *InnerLimit,
+                         SmallPtrSetImpl<Value *> &LinearIVUses) {
+  // We require all uses of both induction variables to match this pattern:
+  //
+  //   (OuterPHI * InnerLimit) + InnerPHI
+  //
+  // Any uses of the induction variables not matching that pattern would
+  // require a div/mod to reconstruct in the flattened loop, so the
+  // transformation wouldn't be profitable.
+
+  // Check that all uses of the inner loop's induction variable match the
+  // expected pattern, recording the uses of the outer IV.
+  SmallPtrSet<Value *, 4> ValidOuterPHIUses;
+  for (User *U : InnerPHI->users()) {
+    if (U == InnerIncrement)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+
+    Value *MatchedMul, *MatchedItCount;
+    if (match(U, m_c_Add(m_Specific(InnerPHI), m_Value(MatchedMul))) &&
+        match(MatchedMul,
+              m_c_Mul(m_Specific(OuterPHI), m_Value(MatchedItCount))) &&
+        MatchedItCount == InnerLimit) {
+      LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+      ValidOuterPHIUses.insert(MatchedMul);
+      LinearIVUses.insert(U);
+    } else {
+      LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+      return false;
+    }
+  }
+
+  // Check that there are no uses of the outer IV other than the ones found
+  // as part of the pattern above.
+  for (User *U : OuterPHI->users()) {
+    if (U == OuterIncrement)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+
+    if (!ValidOuterPHIUses.count(U)) {
+      LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+      return false;
+    } else {
+      LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Found " << LinearIVUses.size()
+                    << " value(s) that can be replaced:\n";
+             for (Value *V : LinearIVUses) {
+               dbgs() << "  ";
+               V->dump();
+             });
+
+  return true;
+}
+
+// Return an OverflowResult dependent on if overflow of the multiplication of
+// InnerLimit and OuterLimit can be assumed not to happen.
+static OverflowResult checkOverflow(Loop *OuterLoop, Value *InnerLimit,
+                                    Value *OuterLimit,
+                                    SmallPtrSetImpl<Value *> &LinearIVUses,
+                                    DominatorTree *DT, AssumptionCache *AC) {
+  Function *F = OuterLoop->getHeader()->getParent();
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
+  // For debugging/testing.
+  if (AssumeNoOverflow)
+    return OverflowResult::NeverOverflows;
+
+  // Check if the multiply could not overflow due to known ranges of the
+  // input values.
+  OverflowResult OR = computeOverflowForUnsignedMul(
+      InnerLimit, OuterLimit, DL, AC,
+      OuterLoop->getLoopPreheader()->getTerminator(), DT);
+  if (OR != OverflowResult::MayOverflow)
+    return OR;
+
+  for (Value *V : LinearIVUses) {
+    for (Value *U : V->users()) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+        // The IV is used as the operand of a GEP, and the IV is at least as
+        // wide as the address space of the GEP. In this case, the GEP would
+        // wrap around the address space before the IV increment wraps, which
+        // would be UB.
+        if (GEP->isInBounds() &&
+            V->getType()->getIntegerBitWidth() >=
+                DL.getPointerTypeSizeInBits(GEP->getType())) {
+          LLVM_DEBUG(
+              dbgs() << "use of linear IV would be UB if overflow occurred: ";
+              GEP->dump());
+          return OverflowResult::NeverOverflows;
+        }
+      }
+    }
+  }
+
+  return OverflowResult::MayOverflow;
+}
+
+static bool FlattenLoopPair(Loop *OuterLoop, Loop *InnerLoop, DominatorTree *DT,
+                            LoopInfo *LI, ScalarEvolution *SE,
+                            AssumptionCache *AC, TargetTransformInfo *TTI,
+                            std::function<void(Loop *)> markLoopAsDeleted) {
+  Function *F = OuterLoop->getHeader()->getParent();
+
+  LLVM_DEBUG(dbgs() << "Running on outer loop "
+                    << OuterLoop->getHeader()->getName() << " and inner loop "
+                    << InnerLoop->getHeader()->getName() << " in "
+                    << F->getName() << "\n");
+
+  SmallPtrSet<Instruction *, 8> IterationInstructions;
+
+  PHINode *InnerInductionPHI, *OuterInductionPHI;
+  Value *InnerLimit, *OuterLimit;
+  BinaryOperator *InnerIncrement, *OuterIncrement;
+  BranchInst *InnerBranch, *OuterBranch;
+
+  if (!findLoopComponents(InnerLoop, IterationInstructions, InnerInductionPHI,
+                          InnerLimit, InnerIncrement, InnerBranch, SE))
+    return false;
+  if (!findLoopComponents(OuterLoop, IterationInstructions, OuterInductionPHI,
+                          OuterLimit, OuterIncrement, OuterBranch, SE))
+    return false;
+
+  // Both of the loop limit values must be invariant in the outer loop
+  // (non-instructions are all inherently invariant).
+  if (!OuterLoop->isLoopInvariant(InnerLimit)) {
+    LLVM_DEBUG(dbgs() << "inner loop limit not invariant\n");
+    return false;
+  }
+  if (!OuterLoop->isLoopInvariant(OuterLimit)) {
+    LLVM_DEBUG(dbgs() << "outer loop limit not invariant\n");
+    return false;
+  }
+
+  SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
+  if (!checkPHIs(OuterLoop, InnerLoop, InnerPHIsToTransform, InnerInductionPHI,
+                 OuterInductionPHI, TTI))
+    return false;
+
+  // FIXME: it should be possible to handle different types correctly.
+  if (InnerInductionPHI->getType() != OuterInductionPHI->getType())
+    return false;
+
+  if (!checkOuterLoopInsts(OuterLoop, InnerLoop, IterationInstructions,
+                           InnerLimit, OuterInductionPHI, TTI))
+    return false;
+
+  // Find the values in the loop that can be replaced with the linearized
+  // induction variable, and check that there are no other uses of the inner
+  // or outer induction variable. If there were, we could still do this
+  // transformation, but we'd have to insert a div/mod to calculate the
+  // original IVs, so it wouldn't be profitable.
+  SmallPtrSet<Value *, 4> LinearIVUses;
+  if (!checkIVUsers(InnerInductionPHI, OuterInductionPHI, InnerIncrement,
+                    OuterIncrement, InnerLimit, LinearIVUses))
+    return false;
+
+  // Check if the new iteration variable might overflow. In this case, we
+  // need to version the loop, and select the original version at runtime if
+  // the iteration space is too large.
+  // TODO: We currently don't version the loop.
+  // TODO: it might be worth using a wider iteration variable rather than
+  // versioning the loop, if a wide enough type is legal.
+  bool MustVersionLoop = true;
+  OverflowResult OR =
+      checkOverflow(OuterLoop, InnerLimit, OuterLimit, LinearIVUses, DT, AC);
+  if (OR == OverflowResult::AlwaysOverflowsHigh ||
+      OR == OverflowResult::AlwaysOverflowsLow) {
+    LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
+    return false;
+  } else if (OR == OverflowResult::MayOverflow) {
+    LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+  } else {
+    LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
+    MustVersionLoop = false;
+  }
+
+  // We cannot safely flatten the loop. Exit now.
+  if (MustVersionLoop)
+    return false;
+
+  // Do the actual transformation.
+  LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
+
+  {
+    using namespace ore;
+    OptimizationRemark Remark(DEBUG_TYPE, "Flattened", InnerLoop->getStartLoc(),
+                              InnerLoop->getHeader());
+    OptimizationRemarkEmitter ORE(F);
+    Remark << "Flattened into outer loop";
+    ORE.emit(Remark);
+  }
+
+  Value *NewTripCount =
+      BinaryOperator::CreateMul(InnerLimit, OuterLimit, "flatten.tripcount",
+                                OuterLoop->getLoopPreheader()->getTerminator());
+  LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
+             NewTripCount->dump());
+
+  // Fix up PHI nodes that take values from the inner loop back-edge, which
+  // we are about to remove.
+  InnerInductionPHI->removeIncomingValue(InnerLoop->getLoopLatch());
+  for (PHINode *PHI : InnerPHIsToTransform)
+    PHI->removeIncomingValue(InnerLoop->getLoopLatch());
+
+  // Modify the trip count of the outer loop to be the product of the two
+  // trip counts.
+  cast<ICmpInst>(OuterBranch->getCondition())->setOperand(1, NewTripCount);
+
+  // Replace the inner loop backedge with an unconditional branch to the exit.
+  BasicBlock *InnerExitBlock = InnerLoop->getExitBlock();
+  BasicBlock *InnerExitingBlock = InnerLoop->getExitingBlock();
+  InnerExitingBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+  DT->deleteEdge(InnerExitingBlock, InnerLoop->getHeader());
+
+  // Replace all uses of the polynomial calculated from the two induction
+  // variables with the one new one.
+  for (Value *V : LinearIVUses)
+    V->replaceAllUsesWith(OuterInductionPHI);
+
+  // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
+  // deleted, and any information they have about the outer loop invalidated.
+  markLoopAsDeleted(InnerLoop);
+  SE->forgetLoop(OuterLoop);
+  SE->forgetLoop(InnerLoop);
+  LI->erase(InnerLoop);
+
+  return true;
+}
+
+PreservedAnalyses LoopFlattenPass::run(Loop &L, LoopAnalysisManager &AM,
+                                       LoopStandardAnalysisResults &AR,
+                                       LPMUpdater &Updater) {
+  if (L.getSubLoops().size() != 1)
+    return PreservedAnalyses::all();
+
+  Loop *InnerLoop = *L.begin();
+  std::string LoopName(InnerLoop->getName());
+  if (!FlattenLoopPair(
+          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI,
+          [&](Loop *L) { Updater.markLoopAsDeleted(*L, LoopName); }))
+    return PreservedAnalyses::all();
+  return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+class LoopFlattenLegacyPass : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopFlattenLegacyPass() : LoopPass(ID) {
+    initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Possibly flatten loop L into its child
+  bool runOnLoop(Loop *L, LPPassManager &) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    getLoopAnalysisUsage(AU);
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addPreserved<AssumptionCacheTracker>();
+  }
+};
+} // namespace
+
+char LoopFlattenLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+                    false, false)
+
+Pass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
+
+bool LoopFlattenLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipLoop(L))
+    return false;
+
+  if (L->getSubLoops().size() != 1)
+    return false;
+
+  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
+  TargetTransformInfo *TTI = &TTIP.getTTI(*L->getHeader()->getParent());
+  AssumptionCache *AC =
+      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+          *L->getHeader()->getParent());
+
+  Loop *InnerLoop = *L->begin();
+  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, AC, TTI,
+                         [&](Loop *L) { LPM.markLoopAsDeleted(*L); });
+}
Index: llvm/lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/Scalar.cpp
+++ llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -67,6 +67,7 @@
   initializeLoopAccessLegacyAnalysisPass(Registry);
   initializeLoopInstSimplifyLegacyPassPass(Registry);
   initializeLoopInterchangePass(Registry);
+  initializeLoopFlattenLegacyPassPass(Registry);
   initializeLoopPredicationLegacyPassPass(Registry);
   initializeLoopRotateLegacyPassPass(Registry);
   initializeLoopStrengthReducePass(Registry);
@@ -186,6 +187,10 @@
   unwrap(PM)->add(createLoopDeletionPass());
 }
 
+void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopFlattenPass());
+}
+
 void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopIdiomPass());
 }
Index: llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
===================================================================
--- /dev/null
+++
llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -0,0 +1,395 @@
+; RUN: opt < %s -S -loop-flatten -debug-only=loop-flatten 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Every function in this file has a reason that it can't be transformed.
+
+; CHECK-NOT: Checks all passed, doing the transformation
+
+; Outer loop does not start at zero
+define void @test_1(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+
+for.body4.lr.ph:
+  %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 1, %entry ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+
+for.body4:
+  %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add nsw i32 %j.024, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul5 = mul nsw i32 %0, %scale
+  %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add
+  store i32 %mul5, i32* %arrayidx8, align 4
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp eq i32 %inc10, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+
+for.cond.cleanup:
+  ret void
+}
+
+; Inner loop does not start at zero
+define void @test_2(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %cmp25 = icmp sgt i32 %N, 0
+  br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+
+for.body4.lr.ph:
+  %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ]
+  %mul = mul nsw i32 %i.026, %N
+  br label %for.body4
+
+for.body4:
+  %j.024 = phi i32 [ 1, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add nsw i32 %j.024, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul5 = mul nsw i32 %0, %scale
+  %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add
+  store i32 %mul5, i32* %arrayidx8, align 4
+  %inc = add nuw nsw i32 %j.024, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3:
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp eq i32 %inc10, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+
+for.cond.cleanup:
+  ret void
+}
+
+; Outer IV used directly
+define hidden void @test_3(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %conv = zext i16 %N to i32
+  %cmp25 = icmp eq i16 %N, 0
+  br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us:                          ; preds = %entry
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us
+  %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i.026.us
+  %mul9.us = mul nuw nsw i32 %i.026.us, %conv
+  br label %for.body7.us
+
+for.body7.us:                                     ; preds = %for.body.us, %for.body7.us
+  %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ]
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %mul.us = mul nsw i32 %0, %scale
+  %add.us = add nuw nsw i32 %j.024.us, %mul9.us
+  %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+  store i32 %mul.us, i32* %arrayidx10.us, align 4
+  %inc.us = add nuw nsw i32 %j.024.us, 1
+  %exitcond = icmp ne i32 %inc.us, %conv
+  br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us:         ; preds = %for.body7.us
+  %inc12.us = add nuw nsw i32 %i.026.us, 1
+  %exitcond27 = icmp ne i32 %inc12.us, %conv
+  br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+; Inner IV used directly
+define hidden void @test_4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %conv = zext i16 %N to i32
+  %cmp25 = icmp eq i16 %N, 0
+  br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us:                          ; preds = %entry
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us
+  %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ]
+  %mul9.us = mul nuw nsw i32 %i.026.us, %conv
+  br label %for.body7.us
+
+for.body7.us:                                     ; preds = %for.body.us, %for.body7.us
+  %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ]
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %j.024.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  %mul.us = mul nsw i32 %0, %scale
+  %add.us = add nuw nsw i32 %j.024.us, %mul9.us
+  %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+  store i32 %mul.us, i32* %arrayidx10.us, align 4
+  %inc.us = add nuw nsw i32 %j.024.us, 1
+  %exitcond = icmp ne i32 %inc.us, %conv
+  br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us:         ; preds = %for.body7.us
+  %inc12.us = add nuw nsw i32 %i.026.us, 1
+  %exitcond27 = icmp ne i32 %inc12.us, %conv
+  br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+; Inner iteration count not invariant in outer loop
+declare i32 @get_int() readonly
+define void @test_5(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+  %conv = zext i16 %N to i32
+  %cmp27 = icmp eq i16 %N, 0
+  br i1 %cmp27, label
%for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.cond.cleanup5 + %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %inc12, %for.cond.cleanup5 ] + %call = tail call i32 @get_int() + %cmp325 = icmp sgt i32 %call, 0 + br i1 %cmp325, label %for.body6.lr.ph, label %for.cond.cleanup5 + +for.body6.lr.ph: ; preds = %for.body + %mul = mul nsw i32 %call, %i.028 + br label %for.body6 + +for.cond.cleanup5.loopexit: ; preds = %for.body6 + br label %for.cond.cleanup5 + +for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body + %inc12 = add nuw nsw i32 %i.028, 1 + %exitcond29 = icmp ne i32 %inc12, %conv + br i1 %exitcond29, label %for.body, label %for.cond.cleanup.loopexit + +for.body6: ; preds = %for.body6.lr.ph, %for.body6 + %j.026 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] + %add = add nsw i32 %j.026, %mul + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add + %0 = load i32, i32* %arrayidx, align 4 + %mul7 = mul nsw i32 %0, %scale + %arrayidx10 = getelementptr inbounds i32, i32* %C, i32 %add + store i32 %mul7, i32* %arrayidx10, align 4 + %inc = add nuw nsw i32 %j.026, 1 + %exitcond = icmp ne i32 %inc, %call + br i1 %exitcond, label %for.body6, label %for.cond.cleanup5.loopexit +} + +; Inner loop has an early exit +define hidden void @test_6(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp39 = icmp eq i16 %N, 0 + br i1 %cmp39, label %for.cond.cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %cleanup.us + %i.040.us = phi i32 [ %inc19.us, %cleanup.us ], [ 0, %for.body.us.preheader ] + %mul.us = mul nuw nsw i32 %i.040.us, %conv 
+ br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %if.end.us + %j.038.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %add.us = add nuw nsw i32 %j.038.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %tobool.us = icmp eq i32 %0, 0 + br i1 %tobool.us, label %if.end.us, label %cleanup.us + +cleanup.us: ; preds = %if.end.us, %for.body7.us + %inc19.us = add nuw nsw i32 %i.040.us, 1 + %exitcond = icmp eq i32 %inc19.us, %conv + br i1 %exitcond, label %for.cond.cleanup, label %for.body.us + +if.end.us: ; preds = %for.body7.us + %arrayidx17.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 0, i32* %arrayidx17.us, align 4 + %inc.us = add nuw nsw i32 %j.038.us, 1 + %cmp4.us = icmp ult i32 %inc.us, %conv + br i1 %cmp4.us, label %for.body7.us, label %cleanup.us + +for.cond.cleanup: ; preds = %cleanup.us, %entry + ret void +} + +define hidden void @test_7(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %cleanup, label %for.body.us.preheader + +for.body.us.preheader: ; preds = %entry + br label %for.body.us + +for.body.us: ; preds = %for.body.us.preheader, %for.cond2.for.cond.cleanup6_crit_edge.us + %i.031.us = phi i32 [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ], [ 0, %for.body.us.preheader ] + %call.us = tail call i32 @get_int() #2 + %tobool.us = icmp eq i32 %call.us, 0 + br i1 %tobool.us, label %for.body7.lr.ph.us, label %cleanup + +for.body7.us: ; preds = %for.body7.us, %for.body7.lr.ph.us + %j.029.us = phi i32 [ 0, %for.body7.lr.ph.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store 
i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp eq i32 %inc.us, %conv + br i1 %exitcond, label %for.cond2.for.cond.cleanup6_crit_edge.us, label %for.body7.us + +for.body7.lr.ph.us: ; preds = %for.body.us + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw nsw i32 %i.031.us, 1 + %cmp.us = icmp ult i32 %inc15.us, %conv + br i1 %cmp.us, label %for.body.us, label %cleanup + +cleanup: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.us, %entry + ret void +} + +; Step is not 1 +define i32 @test_8(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 2 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Step is not 1 +define i32 @test_9(i32 %val, i16* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, 
%mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 2 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; Outer loop conditional phi +define i32 @e() { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.end16 + %f.033 = phi i32 [ 0, %entry ], [ %inc18, %for.end16 ] + %g.032 = phi i32 [ undef, %entry ], [ %g.3.lcssa, %for.end16 ] + %.pr = add i32 10, 10 + %tobool29 = icmp eq i32 %.pr, 0 + br i1 %tobool29, label %for.end, label %for.body2.lr.ph + +for.body2.lr.ph: ; preds = %for.body + br label %for.cond1.for.end_crit_edge + +for.cond1.for.end_crit_edge: ; preds = %for.body2.lr.ph + br label %for.end + +for.end: ; preds = %for.cond1.for.end_crit_edge, %for.body + %g.1.lcssa = phi i32 [ 0, %for.cond1.for.end_crit_edge ], [ %g.032, %for.body ] + br label %for.body5 + +for.body5: ; preds = %for.end, %lor.end + %i.031 = phi i32 [ 0, %for.end ], [ %inc15, %lor.end ] + %g.230 = phi i32 [ %g.1.lcssa, %for.end ], [ %g.3, %lor.end ] + %0 = add i32 10, 10 + %1 = add i32 10, 10 + %tobool9 = icmp eq i32 %1, 0 + br i1 %tobool9, label %lor.rhs, label %lor.end + +lor.rhs: ; preds = %for.body5 + %2 = add i32 10, 10 + %call11 = add i32 10, 10 + %tobool12 = icmp ne i32 %call11, 0 + br label %lor.end + +lor.end: ; preds = %for.body5, %lor.rhs + %g.3 = phi i32 [ %g.230, %for.body5 ], [ %call11, %lor.rhs ] + %3 = phi i1 [ true, %for.body5 ], [ %tobool12, %lor.rhs ] + %lor.ext = zext i1 %3 to i32 + %inc15 = add nuw nsw i32 %i.031, 1 + %exitcond = icmp ne i32 %inc15, 9 + br i1 %exitcond, label 
%for.body5, label %for.end16 + +for.end16: ; preds = %lor.end + %g.3.lcssa = phi i32 [ %g.3, %lor.end ] + %inc18 = add nuw nsw i32 %f.033, 1 + %exitcond34 = icmp ne i32 %inc18, 7 + br i1 %exitcond34, label %for.body, label %for.end19 + +for.end19: ; preds = %for.end16 + ret i32 undef +} Index: llvm/test/Transforms/LoopFlatten/loop-flatten.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFlatten/loop-flatten.ll @@ -0,0 +1,591 @@ +; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +; CHECK-LABEL: test1 +; Simple loop where the IV's is constant +define i32 @test1(i32 %val, i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp ne i32 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc6 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 
+; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp ne i32 %inc, 20 +; CHECK: br label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp ne i32 %inc7, 10 + br i1 %exitcond19, label %for.body, label %for.end8 +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp ne i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.body, label %for.end8 + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +; CHECK-LABEL: test2 +; Same as above but non constant IV (which still cannot overflow) +define i32 @test2(i8 zeroext %I, i32 %val, i16* nocapture %A) { +entry: + %conv = zext i8 %I to i32 + %cmp26 = icmp eq i8 %I, 0 + br i1 %cmp26, label %for.end13, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %conv, %conv +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.inc11_crit_edge.us, %for.body.lr.ph.split.us + %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.027.us, %conv + br label %for.body6.us +; CHECK: for.body.us: +; CHECK: %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.027.us, %conv +; CHECK: br label %for.body6.us + +for.body6.us: ; preds = %for.body.us, %for.body6.us + %j.025.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body6.us ] + %add.us = add nuw nsw i32 %j.025.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv823.us = zext i16 %0 to i32 + %add9.us = add i32 %conv823.us, %val + %conv10.us = trunc i32 %add9.us to 
i16 + store i16 %conv10.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw nsw i32 %j.025.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 %exitcond, label %for.body6.us, label %for.cond2.for.inc11_crit_edge.us +; CHECK: for.body6.us: +; CHECK: %j.025.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.025.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.027.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv823.us = zext i16 %0 to i32 +; CHECK: %add9.us = add i32 %conv823.us, %val +; CHECK: %conv10.us = trunc i32 %add9.us to i16 +; CHECK: store i16 %conv10.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw nsw i32 %j.025.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %conv +; CHECK: br label %for.cond2.for.inc11_crit_edge.us + +for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us + %inc12.us = add nuw nsw i32 %i.027.us, 1 + %exitcond28 = icmp ne i32 %inc12.us, %conv + br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit +; CHECK: for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us +; CHECK: %inc12.us = add nuw nsw i32 %i.027.us, 1 +; CHECK: %exitcond28 = icmp ne i32 %inc12.us, %flatten.tripcount +; CHECK: br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit + +for.end13.loopexit: ; preds = %for.cond2.for.inc11_crit_edge.us + br label %for.end13 + +for.end13: ; preds = %for.end13.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %conv, %for.end13.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test3 +; Same as above, uses load to determine it cant overflow +define i32 @test3(i32 %N, i32 %val, i16* nocapture %A) local_unnamed_addr #0 { +entry: + %cmp21 = icmp eq i32 %N, 0 + br i1 %cmp21, label %for.end8, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %N, %N +; CHECK: br label %for.body.us + +for.body.us: ; 
preds = %for.cond1.for.inc6_crit_edge.us, %for.body.lr.ph.split.us + %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] + %mul.us = mul i32 %i.022.us, %N + br label %for.body3.us +; CHECK: for.body.us: +; CHECK: %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.022.us, %N +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %for.body3.us + %j.020.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ] + %add.us = add i32 %j.020.us, %mul.us + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us + %0 = load i16, i16* %arrayidx.us, align 2 + %conv18.us = zext i16 %0 to i32 + %add4.us = add i32 %conv18.us, %val + %conv5.us = trunc i32 %add4.us to i16 + store i16 %conv5.us, i16* %arrayidx.us, align 2 + %inc.us = add nuw i32 %j.020.us, 1 + %exitcond = icmp ne i32 %inc.us, %N + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc6_crit_edge.us +; CHECK: for.body3.us: +; CHECK: %j.020.us = phi i32 [ 0, %for.body.us ] +; CHECK: %add.us = add i32 %j.020.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.022.us +; CHECK: %0 = load i16, i16* %arrayidx.us, align 2 +; CHECK: %conv18.us = zext i16 %0 to i32 +; CHECK: %add4.us = add i32 %conv18.us, %val +; CHECK: %conv5.us = trunc i32 %add4.us to i16 +; CHECK: store i16 %conv5.us, i16* %arrayidx.us, align 2 +; CHECK: %inc.us = add nuw i32 %j.020.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %N +; CHECK: br label %for.cond1.for.inc6_crit_edge.us + +for.cond1.for.inc6_crit_edge.us: ; preds = %for.body3.us + %inc7.us = add nuw i32 %i.022.us, 1 + %exitcond23 = icmp ne i32 %inc7.us, %N + br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit +; CHECK: for.cond1.for.inc6_crit_edge.us: +; CHECK: %inc7.us = add nuw i32 %i.022.us, 1 +; CHECK: %exitcond23 = icmp ne i32 %inc7.us, %flatten.tripcount +; CHECK: br i1 %exitcond23, 
label %for.body.us, label %for.end8.loopexit + +for.end8.loopexit: ; preds = %for.cond1.for.inc6_crit_edge.us + br label %for.end8 + +for.end8: ; preds = %for.end8.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %N, %for.end8.loopexit ] + ret i32 %i.0.lcssa +} + + +; CHECK-LABEL: test4 +; Multiplication cannot overflow, so we can replace the original loop. +define void @test4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) { +entry: + %conv = zext i16 %N to i32 + %cmp30 = icmp eq i16 %N, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph.split.us +; CHECK: entry: +; CHECK: %[[LIMIT:.*]] = zext i16 %N to i32 +; CHECK: br i1 %{{.*}} label %for.cond.cleanup, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %entry + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %[[TRIPCOUNT:.*]] = mul i32 %[[LIMIT]], %[[LIMIT]] +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us + %i.031.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.031.us, %conv + br label %for.body7.us +; CHECK: for.body.us: +; CHECK: %[[OUTER_IV:.*]] = phi i32 +; CHECK: br label %for.body7.us + +for.body7.us: ; preds = %for.body.us, %for.body7.us +; CHECK: for.body7.us: + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us +; CHECK: getelementptr inbounds i32, i32* %A, i32 %[[OUTER_IV]] + %0 = load i32, i32* %arrayidx.us, align 4 + %mul9.us = mul nsw i32 %0, %scale +; CHECK: getelementptr inbounds i32, i32* %C, i32 %[[OUTER_IV]] + %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us + store i32 %mul9.us, i32* %arrayidx13.us, align 4 + %inc.us = add nuw nsw i32 %j.029.us, 1 + %exitcond = icmp ne i32 %inc.us, %conv + br i1 
%exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us +; CHECK: br label %for.cond2.for.cond.cleanup6_crit_edge.us + +for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us + %inc15.us = add nuw nsw i32 %i.031.us, 1 + %exitcond32 = icmp ne i32 %inc15.us, %conv + br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond2.for.cond.cleanup6_crit_edge.us: +; CHECK: br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us + br label %for.cond.cleanup +; CHECK: for.cond.cleanup.loopexit: +; CHECK: br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void +; CHECK: for.cond.cleanup: +; CHECK: ret void +} + + +; CHECK-LABEL: test5 +define i32 @test5(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ 
%xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %call.us = tail call i32 @func(i32 1) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %call.us = tail call i32 @func(i32 1) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = 
%for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = %for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 + %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + + +; CHECK-LABEL: test6 +define i32 @test6(i8 zeroext %I, i16 zeroext %J) { +entry: + %0 = lshr i8 %I, 1 + %div = zext i8 %0 to i32 + %cmp30 = icmp eq i8 %0, 0 + br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %1 = lshr i16 %J, 1 + %div5 = zext i16 %1 to i32 + %cmp627 = icmp eq i16 %1, 0 + br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us + +for.body.lr.ph.split.us: ; preds = %for.body.lr.ph + br label %for.body.us +; CHECK: for.body.lr.ph.split.us: +; CHECK: %flatten.tripcount = mul i32 %div5, %div +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us + %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + %mul.us = mul nuw nsw i32 %i.032.us, %div5 + br label %for.body9.us +; CHECK: for.body.us: +; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] +; CHECK: %mul.us = mul nuw nsw i32 %i.032.us, %div5 +; CHECK: br label %for.body9.us + +for.body9.us: ; preds = %for.body.us, %for.body9.us + %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ] + %x.128.us = phi i32 [ 
%x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ] + %add.us = add nuw nsw i32 %j.029.us, %mul.us + %call.us = tail call i32 @func(i32 %add.us) + %sub.us = sub nsw i32 %call.us, %x.128.us + %xor.us = xor i32 %sub.us, %x.128.us + %inc.us = add nuw nsw i32 %j.029.us, 1 + %cmp6.us = icmp ult i32 %inc.us, %div5 + br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us +; CHECK: for.body9.us: +; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ] +; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ] +; CHECK: %add.us = add nuw nsw i32 %j.029.us, %mul.us +; CHECK: %call.us = tail call i32 @func(i32 %i.032.us) +; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us +; CHECK: %xor.us = xor i32 %sub.us, %x.128.us +; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1 +; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5 +; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us + +for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] + %inc13.us = add nuw nsw i32 %i.032.us, 1 + %cmp.us = icmp ult i32 %inc13.us, %div + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit +; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us: +; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ] +; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1 +; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount +; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.body.lr.ph.split: ; preds = %for.body.lr.ph + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us + %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ] + br label %for.cond.cleanup + +for.cond.cleanup.loopexit34: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry + %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, 
%for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ] + ret i32 %x.0.lcssa + +for.body: ; preds = %for.body.lr.ph.split, %for.body + %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ] + %inc13 = add nuw nsw i32 %i.032, 1 + %cmp = icmp ult i32 %inc13, %div + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34 +} + +; CHECK-LABEL: test7 +; Various inner phis and conditions which we can still work with +define signext i16 @test7(i32 %I, i32 %J, i32* nocapture readonly %C, i16 signext %limit) { +entry: + %cmp43 = icmp eq i32 %J, 0 + br i1 %cmp43, label %for.end17, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %conv = sext i16 %limit to i32 + br label %for.body.us +; CHECK: for.body.lr.ph: +; CHECK: %conv = sext i16 %limit to i32 +; CHECK: %flatten.tripcount = mul i32 %J, %J +; CHECK: br label %for.body.us + +for.body.us: ; preds = %for.cond1.for.inc15_crit_edge.us, %for.body.lr.ph + %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] + %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + %mul.us = mul i32 %i.047.us, %J + br label %for.body3.us +; CHECK: for.body.us: +; CHECK: %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] +; CHECK: %mul.us = mul i32 %i.047.us, %J +; CHECK: br label %for.body3.us + +for.body3.us: ; preds = %for.body.us, %if.end.us + %j.040.us = phi 
i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ] + %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ], [ %ret.2.us, %if.end.us ] + %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ], [ %0, %if.end.us ] + %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ], [ %tmp.2.us, %if.end.us ] + %add.us = add i32 %j.040.us, %mul.us + %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %add.us + %0 = load i32, i32* %arrayidx.us, align 4 + %add4.us = add nsw i32 %0, %tmp.137.us + %cmp5.us = icmp sgt i32 %add4.us, %conv + br i1 %cmp5.us, label %if.then.us, label %if.else.us +; CHECK: for.body3.us: +; CHECK: %j.040.us = phi i32 [ 0, %for.body.us ] +; CHECK: %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ] +; CHECK: %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ] +; CHECK: %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ] +; CHECK: %add.us = add i32 %j.040.us, %mul.us +; CHECK: %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %i.047.us +; CHECK: %0 = load i32, i32* %arrayidx.us, align 4 +; CHECK: %add4.us = add nsw i32 %0, %tmp.137.us +; CHECK: %cmp5.us = icmp sgt i32 %add4.us, %conv +; CHECK: br i1 %cmp5.us, label %if.then.us, label %if.else.us + +if.else.us: ; preds = %for.body3.us + %cmp10.us = icmp sgt i32 %0, %prev.138.us + %cond.us = zext i1 %cmp10.us to i32 + %conv1235.us = zext i16 %ret.139.us to i32 + %add13.us = add nuw nsw i32 %cond.us, %conv1235.us + br label %if.end.us +; CHECK: if.else.us: +; CHECK: %cmp10.us = icmp sgt i32 %0, %prev.138.us +; CHECK: %cond.us = zext i1 %cmp10.us to i32 +; CHECK: %conv1235.us = zext i16 %ret.139.us to i32 +; CHECK: %add13.us = add nuw nsw i32 %cond.us, %conv1235.us +; CHECK: br label %if.end.us + +if.then.us: ; preds = %for.body3.us + %conv7.us = sext i16 %ret.139.us to i32 + %add8.us = add nsw i32 %conv7.us, 10 + br label %if.end.us +; CHECK: if.then.us: +; CHECK: %conv7.us = sext i16 %ret.139.us to i32 +; CHECK: %add8.us = add nsw i32 %conv7.us, 10 +; CHECK: br label %if.end.us + +if.end.us: ; preds = 
%if.then.us, %if.else.us + %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] + %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] + %ret.2.us = trunc i32 %ret.2.in.us to i16 + %inc.us = add nuw i32 %j.040.us, 1 + %exitcond = icmp ne i32 %inc.us, %J + br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc15_crit_edge.us +; CHECK: if.end.us: +; CHECK: %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ] +; CHECK: %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ] +; CHECK: %ret.2.us = trunc i32 %ret.2.in.us to i16 +; CHECK: %inc.us = add nuw i32 %j.040.us, 1 +; CHECK: %exitcond = icmp ne i32 %inc.us, %J +; CHECK: br label %for.cond1.for.inc15_crit_edge.us + +for.cond1.for.inc15_crit_edge.us: ; preds = %if.end.us + %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] + %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] + %.lcssa = phi i32 [ %0, %if.end.us ] + %inc16.us = add nuw i32 %i.047.us, 1 + %exitcond49 = icmp ne i32 %inc16.us, %J + br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit +; CHECK: for.cond1.for.inc15_crit_edge.us: +; CHECK: %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ] +; CHECK: %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ] +; CHECK: %.lcssa = phi i32 [ %0, %if.end.us ] +; CHECK: %inc16.us = add nuw i32 %i.047.us, 1 +; CHECK: %exitcond49 = icmp ne i32 %inc16.us, %flatten.tripcount +; CHECK: br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit + +for.end17.loopexit: ; preds = %for.cond1.for.inc15_crit_edge.us + %ret.2.us.lcssa.lcssa = phi i16 [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ] + br label %for.end17 + +for.end17: ; preds = %for.end17.loopexit, %entry + %ret.0.lcssa = phi i16 [ 0, %entry ], [ %ret.2.us.lcssa.lcssa, %for.end17.loopexit ] + ret i16 %ret.0.lcssa +} + +; CHECK-LABEL: test8 +; Same as test1, but with different continue block order +; (uses icmp eq and loops on false) +define i32 @test8(i32 %val, 
i16* nocapture %A) { +entry: + br label %for.body +; CHECK: entry: +; CHECK: %flatten.tripcount = mul i32 20, 10 +; CHECK: br label %for.body + +for.body: ; preds = %entry, %for.inc6 + %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] + %mul = mul nuw nsw i32 %i.018, 20 + br label %for.body3 +; CHECK: for.body: +; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ] +; CHECK: %mul = mul nuw nsw i32 %i.018, 20 +; CHECK: br label %for.body3 + +for.body3: ; preds = %for.body, %for.body3 + %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %add = add nuw nsw i32 %j.017, %mul + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add + %0 = load i16, i16* %arrayidx, align 2 + %conv16 = zext i16 %0 to i32 + %add4 = add i32 %conv16, %val + %conv5 = trunc i32 %add4 to i16 + store i16 %conv5, i16* %arrayidx, align 2 + %inc = add nuw nsw i32 %j.017, 1 + %exitcond = icmp eq i32 %inc, 20 + br i1 %exitcond, label %for.inc6, label %for.body3 +; CHECK: for.body3: +; CHECK: %j.017 = phi i32 [ 0, %for.body ] +; CHECK: %add = add nuw nsw i32 %j.017, %mul +; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 +; CHECK: %0 = load i16, i16* %arrayidx, align 2 +; CHECK: %conv16 = zext i16 %0 to i32 +; CHECK: %add4 = add i32 %conv16, %val +; CHECK: %conv5 = trunc i32 %add4 to i16 +; CHECK: store i16 %conv5, i16* %arrayidx, align 2 +; CHECK: %inc = add nuw nsw i32 %j.017, 1 +; CHECK: %exitcond = icmp eq i32 %inc, 20 +; CHECK: br label %for.inc6 + +for.inc6: ; preds = %for.body3 + %inc7 = add nuw nsw i32 %i.018, 1 + %exitcond19 = icmp eq i32 %inc7, 10 + br i1 %exitcond19, label %for.end8, label %for.body +; CHECK: for.inc6: +; CHECK: %inc7 = add nuw nsw i32 %i.018, 1 +; CHECK: %exitcond19 = icmp eq i32 %inc7, %flatten.tripcount +; CHECK: br i1 %exitcond19, label %for.end8, label %for.body + +for.end8: ; preds = %for.inc6 + ret i32 10 +} + + +declare i32 @func(i32) +