Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -254,6 +254,7 @@
 void initializeStackProtectorPass(PassRegistry&);
 void initializeStackColoringPass(PassRegistry&);
 void initializeStackSlotColoringPass(PassRegistry&);
+void initializeStraightLineStrengthReducePass(PassRegistry &);
 void initializeStripDeadDebugInfoPass(PassRegistry&);
 void initializeStripDeadPrototypesPassPass(PassRegistry&);
 void initializeStripDebugDeclarePass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -167,6 +167,7 @@
       (void) llvm::createScalarizerPass();
       (void) llvm::createSeparateConstOffsetFromGEPPass();
       (void) llvm::createRewriteSymbolsPass();
+      (void) llvm::createStraightLineStrengthReducePass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::ScalarEvolution();
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -412,6 +412,9 @@
 //
 BasicBlockPass *createLoadCombinePass();
 
+FunctionPass *
+createStraightLineStrengthReducePass(const TargetMachine *TM = nullptr);
+
 } // End llvm namespace
 
 #endif
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -160,6 +160,7 @@
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
   addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createStraightLineStrengthReducePass(TM));
   addPass(createSeparateConstOffsetFromGEPPass());
   // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
   // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -38,6 +38,7 @@
   SeparateConstOffsetFromGEP.cpp
   SimplifyCFGPass.cpp
   Sink.cpp
+  StraightLineStrengthReduce.cpp
   StructurizeCFG.cpp
   TailRecursionElimination.cpp
   )
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -69,6 +69,7 @@
   initializeSinkingPass(Registry);
   initializeTailCallElimPass(Registry);
   initializeSeparateConstOffsetFromGEPPass(Registry);
+  initializeStraightLineStrengthReducePass(Registry);
   initializeLoadCombinePass(Registry);
 }
 
Index: lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,284 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidate in the form of
+//
+//   (B + i) * S
+//
+// where B and S are integer constants or variables, and i is a constant
+// integer. If we found two such candidates
+//
+//   S1: X = (B + i) * S
+//   S2: Y = (B + i') * S
+//
+// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
+//
+//   Y = X + (i' - i) * S
+//
+// where (i' - i) * S is folded to the extent possible. When S2 has multiple
+// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
+//
+// TODO(jingyue):
+//
+// - Handle candidates in the form of B + i * S
+//
+// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
+//
+// - Floating point arithmetics when fast math is enabled.
+//
+//===----------------------------------------------------------------------===//
+
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+ public:
+  // SLSR candidate. Such a candidate must be in the form of
+  //
+  //   (Base + Index) * Stride
+  struct Candidate : public ilist_node<Candidate> {
+    Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
+              Value *S = nullptr, Instruction *I = nullptr)
+        : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
+    Value *Base;
+    ConstantInt *Index;
+    Value *Stride;
+    // The instruction this candidate corresponds to. It helps us to rewrite a
+    // candidate with respect to its immediate basis. Note that one instruction
+    // can corresponds to multiple candidates depending on how you associate the
+    // expression. For instance,
+    //
+    // (a + 1) * (b + 2)
+    //
+    // can be treated as
+    //
+    // <Base: a, Index: 1, Stride: b + 2>
+    //
+    // or
+    //
+    // <Base: b, Index: 2, Stride: a + 1>
+    Instruction *Ins;
+    // Points to the immediate basis of this candidate, or nullptr if we cannot
+    // find any basis for this candidate.
+    Candidate *Basis;
+  };
+
+  static char ID;
+
+  StraightLineStrengthReduce(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM), DT(nullptr) {
+    initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    // We do not modify the shape of the CFG.
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+ private:
+  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+  // share the same base and stride.
+  bool isBasisFor(const Candidate &Basis, const Candidate &C);
+  // Checks whether I is in a candidate form. If so, adds all the matching forms
+  // to Candidates, and tries to find the immediate basis for each of them.
+  void allocateCandidateAndFindBasis(Instruction *I);
+  // Given that I is in the form of "(B + Idx) * S", adds this form to
+  // Candidates, and finds its immediate basis.
+  void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+                                     Instruction *I);
+  // Rewrites candidate C with respect to Basis.
+  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+  const TargetMachine *TM;
+  DominatorTree *DT;
+  ilist<Candidate> Candidates;
+  // Temporarily holds all instructions that are unlinked (but not deleted) by
+  // rewriteCandidateWithBasis. These instructions will be actually removed
+  // after all rewriting finishes.
+  DenseSet<Instruction *> UnlinkedInstructions;
+};
+}  // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+                      "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+                    "Straight line strength reduction", false, false)
+
+FunctionPass *
+llvm::createStraightLineStrengthReducePass(const TargetMachine *TM) {
+  return new StraightLineStrengthReduce(TM);
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  return (Basis.Ins != C.Ins &&  // skip the same instruction
+          // Basis must dominate C in order to rewrite C with respect to Basis.
+          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+          // They share the same base and stride.
+          Basis.Base == C.Base &&
+          Basis.Stride == C.Stride);
+}
+
+// TODO(jingyue): We currently implement an algorithm whose time complexity is
+// linear to the number of existing candidates. However, a better algorithm
+// exists. We could depth-first search the dominator tree, and maintain a hash
+// table that contains all candidates that dominate the node being traversed.
+// This hash table is indexed by the base and the stride of a candidate.
+// Therefore, finding the immediate basis of a candidate boils down to one
+// hash-table look up.
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
+                                                               ConstantInt *Idx,
+                                                               Value *S,
+                                                               Instruction *I) {
+  Candidate C(B, Idx, S, I);
+  // Try to compute the immediate basis of C.
+  unsigned NumIterations = 0;
+  // Limit the scan radius to avoid running forever.
+  static const int MaxNumIterations = 50;
+  for (auto Basis = Candidates.rbegin();
+       Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+       ++Basis, ++NumIterations) {
+    if (isBasisFor(*Basis, C)) {
+      C.Basis = &(*Basis);
+      break;
+    }
+  }
+  // Regardless of whether we find a basis for C, we need to push C to the
+  // candidate list.
+  Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+  Value *B = nullptr;
+  ConstantInt *Idx = nullptr;
+  // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
+      Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+      for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
+        // Only handle the canonical operand ordering.
+        if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+          // If LHS is in the form of "Base + Index", then I is in the form of
+          // "(Base + Index) * RHS".
+          allocateCandidateAndFindBasis(B, Idx, RHS, I);
+        } else {
+          // Otherwise, at least try the form (LHS + 0) * RHS.
+          allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
+        }
+        // Swap LHS and RHS so that we also cover the cases where LHS is the
+        // stride.
+        if (LHS == RHS)
+          break;
+        std::swap(LHS, RHS);
+      }
+    }
+  }
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  // An instruction can correspond to multiple candidates. Therefore, instead of
+  // simply deleting an instruction when we rewrite it, we mark its parent as
+  // nullptr (i.e. unlink it) so that we can skip the candidates whose
+  // instruction is already rewritten.
+  if (!C.Ins->getParent())
+    return;
+  assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
+  // Basis = (B + i) * S
+  // C     = (B + i') * S
+  //   ==>
+  // C     = Basis + (i' - i) * S
+  IRBuilder<> Builder(C.Ins);
+  ConstantInt *IndexOffset = ConstantInt::get(
+      C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
+  Value *Reduced;
+  // TODO: preserve nsw/nuw in some cases.
+  if (IndexOffset->isOne()) {
+    // If (i' - i) is 1, fold C into Basis + S.
+    Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
+  } else if (IndexOffset->isMinusOne()) {
+    // If (i' - i) is -1, fold C into Basis - S.
+    Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
+  } else {
+    Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+    Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+  }
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  C.Ins->dropAllReferences();
+  // Unlink C.Ins so that we can skip other candidates also corresponding to
+  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.insert(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  // SLSR may decrease ILP. Therefore, we disable it when the target schedules
+  // for ILP.
+  if (TM) {
+    const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+    if (!TL || TL->getSchedulingPreference() == Sched::ILP)
+      return false;
+  }
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  // Traverse the dominator tree in the depth-first order. This order makes sure
+  // all bases of a candidate are in Candidates when we process it.
+  for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+    BasicBlock *B = node->getBlock();
+    for (auto I = B->begin(); I != B->end(); ++I) {
+      allocateCandidateAndFindBasis(I);
+    }
+  }
+
+  // Rewrite candidates in the reverse depth-first order. This order makes sure
+  // a candidate being rewritten is not a basis for any other candidate.
+  while (!Candidates.empty()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr) {
+      rewriteCandidateWithBasis(C, *C.Basis);
+    }
+    Candidates.pop_back();
+  }
+
+  // Delete all unlink instructions.
+  for (auto I : UnlinkedInstructions) {
+    delete I;
+  }
+  bool Ret = !UnlinkedInstructions.empty();
+  UnlinkedInstructions.clear();
+  return Ret;
+}
Index: test/Transforms/StraightLineStrengthReduce/slsr.ll
===================================================================
--- /dev/null
+++ test/Transforms/StraightLineStrengthReduce/slsr.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -slsr -gvn -dce -S | FileCheck %s
+
+declare i32 @foo(i32 %a)
+
+define i32 @slsr1(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr1(
+  ; v0 = foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  ; v1 = foo((b + 1) * s);
+  %b1 = add i32 %b, 1
+  %mul1 = mul i32 %b1, %s
+  %v1 = call i32 @foo(i32 %mul1)
+
+  ; v2 = foo((b + 2) * s);
+  %b2 = add i32 %b, 2
+  %mul2 = mul i32 %b2, %s
+  %v2 = call i32 @foo(i32 %mul2)
+
+  ; return v0 + v1 + v2;
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; v0 = foo(a * b)
+; v1 = foo((a + 1) * b)
+; v2 = foo(a * (b + 1))
+; v3 = foo((a + 1) * (b + 1))
+define i32 @slsr2(i32 %a, i32 %b) {
+; CHECK-LABEL: @slsr2(
+  %a1 = add i32 %a, 1
+  %b1 = add i32 %b, 1
+  %mul0 = mul i32 %a, %b
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %mul1 = mul i32 %a1, %b
+  %mul2 = mul i32 %a, %b1
+  %mul3 = mul i32 %a1, %b1
+
+  %v0 = call i32 @foo(i32 %mul0)
+  %v1 = call i32 @foo(i32 %mul1)
+  %v2 = call i32 @foo(i32 %mul2)
+  %v3 = call i32 @foo(i32 %mul3)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  %3 = add i32 %2, %v3
+  ret i32 %3
+}
+
+; The bump is a multiple of the stride.
+;
+; v0 = foo(b * s);
+; v1 = foo((b + 2) * s);
+; v2 = foo((b + 4) * s);
+; return v0 + v1 + v2;
+;
+; ==>
+;
+; mul0 = b * s;
+; v0 = foo(mul0);
+; bump = s * 2;
+; mul1 = mul0 + bump; // GVN ensures mul1 and mul2 use the same bump.
+; v1 = foo(mul1);
+; mul2 = mul1 + bump;
+; v2 = foo(mul2);
+; return v0 + v1 + v2;
+define i32 @slsr3(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr3(
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  %b1 = add i32 %b, 2
+  %mul1 = mul i32 %b1, %s
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = mul i32 %s, 2
+; CHECK: %mul1 = add i32 %mul0, [[BUMP]]
+  %v1 = call i32 @foo(i32 %mul1)
+
+  %b2 = add i32 %b, 4
+  %mul2 = mul i32 %b2, %s
+; CHECK: %mul2 = add i32 %mul1, [[BUMP]]
+  %v2 = call i32 @foo(i32 %mul2)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; Do not rewrite a candidate if its potential basis does not dominate it.
+; v0 = 0;
+; if (cond)
+;   v0 = foo(a * b);
+; v1 = foo((a + 1) * b);
+; return v0 + v1;
+define i32 @not_dominate(i1 %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: @not_dominate(
+entry:
+  %a1 = add i32 %a, 1
+  br i1 %cond, label %then, label %merge
+
+then:
+  %mul0 = mul i32 %a, %b
+; CHECK: %mul0 = mul i32 %a, %b
+  %v0 = call i32 @foo(i32 %mul0)
+  br label %merge
+
+merge:
+  %v0.phi = phi i32 [ 0, %entry ], [ %mul0, %then ]
+  %mul1 = mul i32 %a1, %b
+; CHECK: %mul1 = mul i32 %a1, %b
+  %v1 = call i32 @foo(i32 %mul1)
+  %sum = add i32 %v0.phi, %v1
+  ret i32 %sum
+}