Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -290,6 +290,9 @@ /// target-independent defaults. void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const; + /// \brief Get target-customized default threshold for loop rotation. + unsigned getLoopRotationDefaultThreshold() const; + /// @} /// \name Scalar Target Information @@ -622,6 +625,7 @@ virtual bool isSourceOfDivergence(const Value *V) = 0; virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0; + virtual unsigned getLoopRotationDefaultThreshold() const = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -759,6 +763,9 @@ void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) override { return Impl.getUnrollingPreferences(L, UP); } + unsigned getLoopRotationDefaultThreshold() const override { + return Impl.getLoopRotationDefaultThreshold(); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -197,6 +197,8 @@ void getUnrollingPreferences(Loop *, TTI::UnrollingPreferences &) {} + unsigned getLoopRotationDefaultThreshold() const { return 16; } + bool isLegalAddImmediate(int64_t Imm) { return false; } bool isLegalICmpImmediate(int64_t Imm) { return false; } Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -273,6 +273,8 @@ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps; } + unsigned getLoopRotationDefaultThreshold() const { return 16; } + /// @} /// \name Vector TTI Implementations Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -15,6 +15,7 @@ #ifndef LLVM_TRANSFORMS_SCALAR_H #define LLVM_TRANSFORMS_SCALAR_H +#include "llvm/ADT/Optional.h" #include namespace llvm { @@ -184,7 +185,7 @@ // // LoopRotate - This pass is a simple loop rotating pass. // -Pass *createLoopRotatePass(int MaxHeaderSize = -1); +Pass *createLoopRotatePass(Optional MaxHeaderSize = None); //===----------------------------------------------------------------------===// // Index: include/llvm/Transforms/Scalar/LoopRotation.h =================================================================== --- include/llvm/Transforms/Scalar/LoopRotation.h +++ include/llvm/Transforms/Scalar/LoopRotation.h @@ -22,9 +22,10 @@ /// A simple loop rotation transformation. class LoopRotatePass : public PassInfoMixin { unsigned MaxHeaderSize; + bool UseDefaultMHS; + public: - LoopRotatePass(); - LoopRotatePass(unsigned MaxHeaderSize) : MaxHeaderSize(MaxHeaderSize) {} + LoopRotatePass(Optional SpecifiedMaxHeaderSize = None); PreservedAnalyses run(Loop &L, AnalysisManager &AM); }; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -100,6 +100,10 @@ return TTIImpl->getUnrollingPreferences(L, UP); } +unsigned TargetTransformInfo::getLoopRotationDefaultThreshold() const { + return TTIImpl->getLoopRotationDefaultThreshold(); +} + bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { return TTIImpl->isLegalAddImmediate(Imm); } Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -53,6 +53,7 @@ /// \name Scalar TTI Implementations /// @{ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + unsigned getLoopRotationDefaultThreshold() const; /// @} Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1484,3 +1484,10 @@ // correct. return (CallerBits & CalleeBits) == CalleeBits; } + +unsigned X86TTIImpl::getLoopRotationDefaultThreshold() const { + // The loop rotation threshold equal to 2 shows the best performance and code + // size results on the benchmarks for Lakemont. For other CPUs use the default + // threshold. + return ST->getCPU() == "lakemont" ? 2 : 16; +} Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -248,7 +248,7 @@ return; } // Rotate Loop - disable header duplication at -Oz - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + MPM.add(createLoopRotatePass(SizeLevel == 2 ? Optional(0) : None)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); MPM.add(createCFGSimplificationPass()); @@ -477,7 +477,7 @@ // Re-rotate loops in all our loop nests. These may have fallout out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. Disable header duplication at -Oz. - MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + MPM.add(createLoopRotatePass(SizeLevel == 2 ? Optional(0) : None)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is Index: lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- lib/Transforms/Scalar/LoopRotation.cpp +++ lib/Transforms/Scalar/LoopRotation.cpp @@ -44,8 +44,8 @@ #define DEBUG_TYPE "loop-rotate" static cl::opt -DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, - cl::desc("The default maximum header size for automatic loop rotation")); +RotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); @@ -564,7 +564,12 @@ return MadeChange; } -LoopRotatePass::LoopRotatePass() : MaxHeaderSize(DefaultRotationThreshold) {} +LoopRotatePass::LoopRotatePass(Optional SpecifiedMaxHeaderSize) { + MaxHeaderSize = SpecifiedMaxHeaderSize.hasValue() ? *SpecifiedMaxHeaderSize + : RotationThreshold; + UseDefaultMHS = !SpecifiedMaxHeaderSize.hasValue() && + RotationThreshold.getNumOccurrences() == 0; +} PreservedAnalyses LoopRotatePass::run(Loop &L, AnalysisManager &AM) { auto &FAM = AM.getResult(L).getManager(); @@ -579,7 +584,10 @@ auto *DT = FAM.getCachedResult(*F); auto *SE = FAM.getCachedResult(*F); - bool Changed = iterativelyRotateLoop(&L, MaxHeaderSize, LI, TTI, AC, DT, SE); + bool Changed = iterativelyRotateLoop( + &L, + UseDefaultMHS ? TTI->getLoopRotationDefaultThreshold() : MaxHeaderSize, + LI, TTI, AC, DT, SE); if (!Changed) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); @@ -589,15 +597,18 @@ class LoopRotateLegacyPass : public LoopPass { unsigned MaxHeaderSize; + bool UseDefaultMHS; public: static char ID; // Pass ID, replacement for typeid - LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + LoopRotateLegacyPass(Optional SpecifiedMaxHeaderSize = None) + : LoopPass(ID) { initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); + + MaxHeaderSize = SpecifiedMaxHeaderSize.hasValue() ? *SpecifiedMaxHeaderSize + : RotationThreshold; + UseDefaultMHS = !SpecifiedMaxHeaderSize.hasValue() && + RotationThreshold.getNumOccurrences() == 0; } // LCSSA form makes instruction renaming easier. @@ -620,7 +631,10 @@ auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; - return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + return iterativelyRotateLoop(L, UseDefaultMHS + ? TTI->getLoopRotationDefaultThreshold() + : MaxHeaderSize, + LI, TTI, AC, DT, SE); } }; } @@ -634,6 +648,6 @@ INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { +Pass *llvm::createLoopRotatePass(Optional MaxHeaderSize) { return new LoopRotateLegacyPass(MaxHeaderSize); } Index: test/Transforms/LoopRotate/target-default.ll =================================================================== --- /dev/null +++ test/Transforms/LoopRotate/target-default.ll @@ -0,0 +1,39 @@ +; REQUIRES: asserts +; RUN: opt < %s -march=x86 -mcpu=pentium -S -loop-rotate -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=PENTIUM +; RUN: opt < %s -march=x86 -mcpu=lakemont -S -loop-rotate -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=LMT +; RUN: opt < %s -march=x86 -mcpu=pentium -S -loop-rotate -rotation-max-header-size=0 -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=PENTIUM-OPT +; RUN: opt < %s -march=x86 -mcpu=lakemont -S -loop-rotate -rotation-max-header-size=16 -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=LMT-OPT + +; Loop should be rotated for Pentium but not for Lakemont. +; PENTIUM: rotating Loop at depth 1 +; LMT-NOT: rotating Loop at depth 1 + +; Specification of -rotation-max-header-size should suppress default +; target threshold. +; PENTIUM-OPT-NOT: rotating Loop at depth 1 +; LMT-OPT: rotating Loop at depth 1 + +target triple = "i386-unknown-linux-gnu" + +declare void @use(i32*, i32) + +define void @test(i32* %x, i32 %y) { +entry: + br label %for.cond + +for.cond: + %x.addr.0 = phi i32* [ %x, %entry ], [ %incdec.ptr, %for.body ] + %0 = load i32, i32* %x.addr.0, align 4 + %cmp = icmp sgt i32 %0, 0 + %cmp1 = icmp sgt i32 %y, 0 + %or.cond = and i1 %cmp, %cmp1 + br i1 %or.cond, label %for.body, label %for.end + +for.body: + tail call void @use(i32* %x.addr.0, i32 %y) + %incdec.ptr = getelementptr inbounds i32, i32* %x.addr.0, i64 1 + br label %for.cond + +for.end: + ret void +}