Index: docs/Passes.rst =================================================================== --- docs/Passes.rst +++ docs/Passes.rst @@ -780,6 +780,14 @@ access for the first iteration, and then creating a new GEP instruction in the loop to increment the value by the appropriate amount. +``-loop-exit-values``: Optimize Loop Exit Values +------------------------------------------------ + +This optimization finds loop exit values reevaluated after the loop +execution and replaces them by the corresponding exit values if they are +available. Such sequencies can arise after SimplifyIndVars and +LoopStrengthReduce passes. This pass should be run after LoopStrengthReduce. + ``-loop-rotate``: Rotate Loops ------------------------------ Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -167,6 +167,7 @@ void initializeLoopRotatePass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); +void initializeLoopExitValuesPass(PassRegistry&); void initializeGlobalMergePass(PassRegistry&); void initializeLoopRerollPass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -106,6 +106,7 @@ (void) llvm::createLoopInterchangePass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopStrengthReducePass(); + (void) llvm::createLoopExitValuesPass(); (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); (void) llvm::createLoopUnswitchPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -156,6 +156,15 @@ //===----------------------------------------------------------------------===// // +// LoopExitValues - This pass finds loop exit values reevaluated after the +// loop execution and replaces them by the corresponding values if they are +// available. Such sequencies often arise after SimplifyIndVars + +// LoopStrengthReduce. This pass should be run after LoopStrengthReduce. +// +Pass *createLoopExitValuesPass(); + +//===----------------------------------------------------------------------===// +// // GlobalMerge - This pass merges internal (by default) globals into structs // to enable reuse of a base pointer by indexed addressing modes. // It can also be configured to focus on size optimizations only. Index: lib/CodeGen/Passes.cpp =================================================================== --- lib/CodeGen/Passes.cpp +++ lib/CodeGen/Passes.cpp @@ -69,6 +69,8 @@ cl::desc("Disable Machine Sinking")); static cl::opt DisableLSR("disable-lsr", cl::Hidden, cl::desc("Disable Loop Strength Reduction Pass")); +static cl::opt DisableLoopExitVals("disable-loop-exit-vals", cl::Hidden, + cl::desc("Disable LoopExitValues pass")); static cl::opt DisableConstantHoisting("disable-constant-hoisting", cl::Hidden, cl::desc("Disable ConstantHoisting")); static cl::opt DisableCGP("disable-cgp", cl::Hidden, @@ -83,6 +85,8 @@ cl::init(false)); static cl::opt PrintLSR("print-lsr-output", cl::Hidden, cl::desc("Print LLVM IR produced by the loop-reduce pass")); +static cl::opt PrintLoopExitValues("print-loop-exit-values-output", cl::Hidden, + cl::desc("Print LLVM IR produced by the loop exit value optimization pass")); static cl::opt PrintISelInput("print-isel-input", cl::Hidden, cl::desc("Print LLVM IR input to isel pass")); static cl::opt PrintGCInfo("print-gc", cl::Hidden, @@ -401,6 +405,12 @@ addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } + if (getOptLevel() != CodeGenOpt::None && !DisableLoopExitVals) { + addPass(createLoopExitValuesPass()); + if (PrintLoopExitValues) + addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after Loop Exit Values Optimization **\n")); + } + // Run GC lowering passes for builtin collectors // TODO: add a pass insertion point here addPass(createGCLoweringPass()); Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -31,6 +31,7 @@ MemCpyOptimizer.cpp MergedLoadStoreMotion.cpp NaryReassociate.cpp + LoopExitValues.cpp PartiallyInlineLibCalls.cpp PlaceSafepoints.cpp Reassociate.cpp Index: lib/Transforms/Scalar/LoopExitValues.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/LoopExitValues.cpp @@ -0,0 +1,191 @@ +//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This optimization finds loop exit values reevaluated after the loop +// execution and replaces them by the corresponding exit values if they are +// available. Such sequences can arise after SimplifyIndVars + +// LoopStrengthReduce passes. This pass should be run after LoopStrengthReduce. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-exit-values" + +/// If any of the instructions is the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead. +/// TODO: This function is identical to one in LoopStrengthReduce. Move these +/// TODO: to one common location. +static bool +DeleteTriviallyDeadInstructions(SmallVectorImpl &DeadInsts) { + bool Changed = false; + + while (!DeadInsts.empty()) { + Value *V = DeadInsts.pop_back_val(); + Instruction *I = dyn_cast_or_null(V); + + if (!I || !isInstructionTriviallyDead(I)) + continue; + + for (Use &O : I->operands()) + if (Instruction *U = dyn_cast(O)) { + O = nullptr; + if (U->use_empty()) + DeadInsts.emplace_back(U); + } + + I->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +static void collectNestedLoops(Loop *L, std::vector &Loops) { + Loops.push_back(L); + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + collectNestedLoops(*I, Loops); +} + +static void collectLoops(LoopInfo &LI, std::vector &Loops) { + for (LoopInfo::iterator I = LI.begin(), E = LI.end(); I != E; ++I) + collectNestedLoops(*I, Loops); +} + +namespace { + +class LoopExitValues : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopExitValues(); + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} // anonymous + +char LoopExitValues::ID = 0; +INITIALIZE_PASS_BEGIN(LoopExitValues, "loop-exit-values", + "Optimize loop exit values", false, false) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopExitValues, "loop-exit-values", + "Optimize loop exit values", false, false) + +Pass *llvm::createLoopExitValuesPass() { + return new LoopExitValues(); +} + +LoopExitValues::LoopExitValues() + : FunctionPass(ID) { + initializeLoopExitValuesPass(*PassRegistry::getPassRegistry()); +} + +bool LoopExitValues::runOnFunction(Function &F) { + ScalarEvolution &SE = getAnalysis().getSE(); + DominatorTree &DT = getAnalysis().getDomTree(); + LoopInfo &LI = getAnalysis().getLoopInfo(); + + std::vector Loops; + collectLoops(LI, Loops); + + // Collect exit values available after each of the loops + typedef DenseMap >SCEVToInstMap; + SCEVToInstMap SCEVToInstr; + for (Loop* L : Loops) { + for (BasicBlock* BB : L->getBlocks()) { + for (BasicBlock::iterator I : *BB) { + if (!SE.isSCEVable(I->getType())) + continue; + + const SCEV *ExitValue = SE.getSCEVAtScope(SE.getSCEV(&*I), + L->getParentLoop()); + if (!SE.isLoopInvariant(ExitValue, L) || + ExitValue->getSCEVType() == scUnknown || + ExitValue->getSCEVType() == scCouldNotCompute) + continue; + + SCEVToInstr[ExitValue].push_back(&*I); + } + } + } + if (SCEVToInstr.empty()) + return false; + + // Find similar values + bool Changed = false; + SmallVector DeadInsts; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + if (!SE.isSCEVable(I->getType())) + continue; + + // Since SCEVs are unique, we may use simple pointer comparison here. + const SCEV *SV = SE.getSCEV(&*I); + SCEVToInstMap::const_iterator iter = SCEVToInstr.find(SV); + if (iter == SCEVToInstr.end()) + continue; + + for (Instruction *Cand : iter->second) { + if (not DT.properlyDominates(Cand->getParent(), I->getParent())) + continue; + + // Try to replace I with the other instruction. + Instruction *NewInstr = Cand; + if (NewInstr->getType() != I->getType()) { + // If either instruction is a bitcast, that's ok, just insert a + // bitcast. Otherwise, some strange stuff detected, so do not + // optimize. + if (isa(NewInstr) || isa(&*I)) + NewInstr = + new BitCastInst(NewInstr, I->getType(), I->getName(), &*I); + else + NewInstr = nullptr; + } + + if (NewInstr == nullptr) + continue; + + DEBUG(dbgs() << "LoopExitValues: Replacing instruction: "); + DEBUG(I->dump()); + DEBUG(dbgs() << " with SCEV equivalent: "); + DEBUG(Cand->dump()); + Changed = true; + I->replaceAllUsesWith(NewInstr); + DeadInsts.push_back(&*I); + break; + } + } + + DeleteTriviallyDeadInstructions(DeadInsts); + return Changed; +} + +void LoopExitValues::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); +} + Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -53,6 +53,7 @@ initializeLoopInterchangePass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopExitValuesPass(Registry); initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); Index: test/CodeGen/Generic/stop-after.ll =================================================================== --- test/CodeGen/Generic/stop-after.ll +++ test/CodeGen/Generic/stop-after.ll @@ -6,6 +6,10 @@ ; STOP-NEXT: Machine Function Analysis ; STOP-NEXT: MIR Printing Pass -; START: -machine-branch-prob -gc-lowering +; START: -machine-branch-prob -domtree -loops -scalar-evolution -loop-exit-values ; START: FunctionPass Manager -; START-NEXT: Lower Garbage Collection Instructions +; START-NEXT: Dominator Tree Construction +; START-NEXT: Natural Loop Information +; START-NEXT: Scalar Evolution Analysis +; START-NEXT: Optimize loop exit values + Index: test/CodeGen/X86/lsr-loop-exit-cond.ll =================================================================== --- test/CodeGen/X86/lsr-loop-exit-cond.ll +++ test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK-LABEL: t: -; CHECK: movl (%r9,%rax,4), %e{{..}} +; CHECK: movl (%r9,%{{.+}},4), %e{{..}} ; CHECK-NEXT: decq ; CHECK-NEXT: jne Index: test/CodeGen/X86/pr11202.ll =================================================================== --- test/CodeGen/X86/pr11202.ll +++ test/CodeGen/X86/pr11202.ll @@ -15,5 +15,5 @@ br label %l1 } -; CHECK: .Ltmp0: # Address of block that was removed by CodeGen +; CHECK: .Ltmp0: ; CHECK: .quad .Ltmp0 Index: test/Transforms/LoopExitValues/matrix_mul.ll =================================================================== --- /dev/null +++ test/Transforms/LoopExitValues/matrix_mul.ll @@ -0,0 +1,51 @@ +; RUN: llc %s -print-lsr-output -print-loop-exit-values-output -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i686-unknown-unknown" + +define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly %Src, i32 %Val) { +entry: + %cmp.25 = icmp eq i32 %Size, 0 + br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph + +for.body.4.lr.ph: ; preds = %entry, %for.cond.cleanup.3 + %Outer.026 = phi i32 [ %inc10, %for.cond.cleanup.3 ], [ 0, %entry ] + %mul = mul i32 %Outer.026, %Size + br label %for.body.4 + +for.cond.cleanup: ; preds = %for.cond.cleanup.3, %entry + ret void + +for.cond.cleanup.3: ; preds = %for.body.4 + %inc10 = add nuw nsw i32 %Outer.026, 1 + %exitcond27 = icmp eq i32 %inc10, %Size + br i1 %exitcond27, label %for.cond.cleanup, label %for.body.4.lr.ph + +for.body.4: ; preds = %for.body.4, %for.body.4.lr.ph + %Inner.024 = phi i32 [ 0, %for.body.4.lr.ph ], [ %inc, %for.body.4 ] + %add = add i32 %Inner.024, %mul + %arrayidx = getelementptr inbounds i32, i32* %Src, i32 %add + %0 = load i32, i32* %arrayidx, align 4, !tbaa !1 + %mul5 = mul i32 %0, %Val + %arrayidx8 = getelementptr inbounds i32, i32* %Dst, i32 %add + store i32 %mul5, i32* %arrayidx8, align 4, !tbaa !1 + %inc = add nuw nsw i32 %Inner.024, 1 + %exitcond = icmp eq i32 %inc, %Size + br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4 +} + +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} + + +; Verify LSR produced the expected bitcasts +; CHECK: Code after LSR +; CHECK: [ %2, {{.+}} ] +; CHECK-NEXT: [ %1, {{.+}} ] + +; Verify Loop Exit Values pass replaced these with available GEPs +; CHECK: Code after Loop Exit Values +; CHECK: [ %scevgep9, {{.+}} ] +; CHECK-NEXT: [ %scevgep4, {{.+}} ] Index: test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll =================================================================== --- test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -210,13 +210,14 @@ ; A9: %.lr.ph ; A9-NOT: lsl.w ; A9-NOT: {{ldr|str|adds|add r}} -; A9: vst1.8 {{.*}} [r{{[0-9]+}}]! +; A9: vst1.8 {{.*}} [{{lr}}]! ; A9-NOT: {{ldr|str|adds|add r}} -; A9: add.w r +; A9: add.w lr ; A9-NOT: {{ldr|str|adds|add r}} -; A9: add.w r +; A9: add.w lr ; A9-NOT: {{ldr|str|adds|add r}} ; A9-NOT: add.w r +; A9-NOT: add.w lr ; A9: bne define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize { %1 = icmp sgt i32 %limit, 0 Index: tools/llc/llc.cpp =================================================================== --- tools/llc/llc.cpp +++ tools/llc/llc.cpp @@ -190,6 +190,7 @@ initializeCore(*Registry); initializeCodeGen(*Registry); initializeLoopStrengthReducePass(*Registry); + initializeLoopExitValuesPass(*Registry); initializeLowerIntrinsicsPass(*Registry); initializeUnreachableBlockElimPass(*Registry);