Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -252,7 +252,7 @@ void initializeRegisterCoalescerPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); void initializeSinkingPass(PassRegistry&); -void initializeSeparateConstOffsetFromGEPPass(PassRegistry &); +void initializeReassociateGEPsPass(PassRegistry &); void initializeSlotIndexesPass(PassRegistry&); void initializeSpillPlacementPass(PassRegistry&); void initializeStackProtectorPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -169,7 +169,7 @@ (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); - (void) llvm::createSeparateConstOffsetFromGEPPass(); + (void)llvm::createReassociateGEPsPass(); (void) llvm::createRewriteSymbolsPass(); (void) llvm::createStraightLineStrengthReducePass(); (void) llvm::createMemDerefPrinter(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -415,11 +415,10 @@ //===----------------------------------------------------------------------===// // -// SeparateConstOffsetFromGEP - Split GEPs for better CSE +// ReassociateGEPs - Split and reassociate GEPs for better CSE and LICM. // -FunctionPass * -createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr, - bool LowerGEP = false); +FunctionPass *createReassociateGEPsPass(const TargetMachine *TM = nullptr, + bool LowerGEP = false); //===----------------------------------------------------------------------===// // Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -228,10 +228,10 @@ TargetPassConfig::addIRPasses(); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { - // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // Call ReassociateGEPs pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or // multiple GEPs with single index. - addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + addPass(createReassociateGEPsPass(TM, true)); // Call EarlyCSE pass to find and remove subexpressions in the lowered // result. addPass(createEarlyCSEPass()); Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -165,22 +165,23 @@ addPass(createGenericToNVVMPass()); addPass(createNVPTXFavorNonGenericAddrSpacesPass()); addPass(createStraightLineStrengthReducePass()); - addPass(createSeparateConstOffsetFromGEPPass()); - // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used - // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates - // significantly better code than EarlyCSE for some of our benchmarks. + addPass(createReassociateGEPsPass()); + // The ReassociateGEPs pass creates variadic bases that can be used + // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. 
GVN + // generates significantly better code than EarlyCSE for some of our + // benchmarks. if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createGVNPass()); else addPass(createEarlyCSEPass()); - // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave + // Both FavorNonGenericAddrSpaces and ReassociateGEPs may leave // some dead code. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. // // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts, // and leave them unused. // - // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the + // ReassociateGEPs rebuilds a new index from the old index, and the // old index and some of its intermediate results may become unused. addPass(createDeadCodeEliminationPass()); } Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -263,10 +263,10 @@ addPass(createPPCLoopDataPrefetchPass()); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { - // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // Call ReassociateGEPs pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or // multiple GEPs with single index. - addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + addPass(createReassociateGEPsPass(TM, true)); // Call EarlyCSE pass to find and remove subexpressions in the lowered // result. addPass(createEarlyCSEPass()); Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -33,6 +33,7 @@ PartiallyInlineLibCalls.cpp PlaceSafepoints.cpp Reassociate.cpp + ReassociateGEPs.cpp Reg2Mem.cpp RewriteStatepointsForGC.cpp SCCP.cpp @@ -41,7 +42,6 @@ Scalar.cpp ScalarReplAggregates.cpp Scalarizer.cpp - SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp Sink.cpp StraightLineStrengthReduce.cpp Index: lib/Transforms/Scalar/ReassociateGEPs.cpp =================================================================== --- lib/Transforms/Scalar/ReassociateGEPs.cpp +++ lib/Transforms/Scalar/ReassociateGEPs.cpp @@ -1,4 +1,4 @@ -//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// +//===-- ReassociateGEPs.cpp - ------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,6 +7,20 @@ // //===----------------------------------------------------------------------===// // +// This pass reassociates the operands of GEP instructions and then +// splits the GEPs to enable later optimizations such as loop +// invariant code motion, constant folding, and common subexpression +// elimination. Specifically, this pass performs the following +// transformations: +// +// (1) Reassociate and gather constant offsets in GEP sequential index operands. +// (2) Reassociate loop invariant terms in GEP sequential index operands. +// (3) (Optional) Lower GEPs into simpler GEPs or their arithmetic equivalents. +// +// Below are the details of the motivation and mechanics of each transformation: +// +// (1) Reassociate and gather constant offsets +// // Loop unrolling may create many similar GEPs for array accesses. 
// e.g., a 2-level loop // @@ -79,10 +93,36 @@ // ld.global.f32 %f3, [%rl6+128]; // much better // ld.global.f32 %f4, [%rl6+132]; // much better // -// Another improvement enabled by the LowerGEP flag is to lower a GEP with -// multiple indices to either multiple GEPs with a single index or arithmetic -// operations (depending on whether the target uses alias analysis in codegen). -// Such transformation can have following benefits: +// (2) Reassociate loop invariant terms +// +// If the base pointer of a GEP is loop invariant and some values in +// GEP index expressions are also loop invariant it may be possible to +// split the GEP into two GEPs, one of which is loop invariant. +// Example: +// +// loop: +// %sum = add %a, %invariant_b +// %idx = add %sum, %c +// %p = gep %invariant_base, %idx +// +// Where %invariant_b and %invariant_base are loop invariant. This +// GEP can be split into two GEPs: +// +// loop: +// %tmp = gep %invariant_base, %invariant_b +// %idx = add %a, %c +// %p = gep %tmp, %idx +// +// The first GEP (%tmp) is now invariant and can be hoisted out of the +// loop. GEPs with multiple sequential index operands might be split +// more than once. +// +// (3) Lower GEPs +// +// If the LowerGEP option is set GEPs with multiple indices are +// lowered to either multiple GEPs with a single index or arithmetic +// operations (depending on whether the target uses alias analysis in +// codegen). Such transformation can have following benefits: // (1) It can always extract constants in the indices of structure type. // (2) After such Lowering, there are more optimization opportunities such as // CSE, LICM and CGP. @@ -145,17 +185,16 @@ // load %p2 // ... // -// Lowering GEPs can also benefit other passes such as LICM and CGP. -// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple -// indices if one of the index is variant. If we lower such GEP into invariant -// parts and variant parts, LICM can hoist/sink those invariant parts. -// CGP (CodeGen Prepare) tries to sink address calculations that match the -// target's addressing modes. A GEP with multiple indices may not match and will -// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of -// them. So we end up with a better addressing mode. +// Lowering GEPs can also benefit other passes such as CGP. CGP +// (CodeGen Prepare) tries to sink address calculations that match the +// target's addressing modes. A GEP with multiple indices may not +// match and will not be sunk. If we lower such GEP into smaller +// parts, CGP may sink some of them. So we end up with a better +// addressing mode. // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -173,79 +212,217 @@ using namespace llvm; -static cl::opt DisableSeparateConstOffsetFromGEP( - "disable-separate-const-offset-from-gep", cl::init(false), - cl::desc("Do not separate the constant offset from a GEP instruction"), - cl::Hidden); +static cl::opt + DisableReassociateGEPs("disable-reassociate-geps", cl::init(false), + cl::desc("Do not reassociate GEP instructions"), + cl::Hidden); namespace { -/// \brief A helper class for separating a constant offset from a GEP index. +/// \brief Element in the def-use chain used for identifying and +/// extracting hoistable values from an expression. +struct UserChainElement { + // The value in the def-use chain. 
+ Value *value; + // For values with more than one operand, this is the operand number + // for this link in the chain. + unsigned operandNo; +}; + +/// \brief A helper class for finding values in an expression which +/// may be hoisted to the outermost level of the expression. /// -/// In real programs, a GEP index may be more complicated than a simple addition -/// of something and a constant integer which can be trivially splitted. For -/// example, to split ((a << 3) | 5) + b, we need to search deeper for the -/// constant offset, so that we can separate the index to (a << 3) + b and 5. +/// When reassociating a GEP index expression for LICM or for +/// extracting constant offsets we want to find a value within the +/// expression which may be safely hoisted to the outermost level of +/// the expression as an addition. For example, the constant 5 can be +/// hoisted out of the expression ((a << 3) | 5) + b to form a new +/// equivalent expression ((a << 3) + b) + 5. /// -/// Therefore, this class looks into the expression that computes a given GEP -/// index, and tries to find a constant integer that can be hoisted to the -/// outermost level of the expression as an addition. Not every constant in an -/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a + -/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case, -/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). -class ConstantOffsetExtractor { - public: - /// Extracts a constant offset from the given GEP index. It returns the - /// new index representing the remainder (equal to the original index minus - /// the constant offset), or nullptr if we cannot extract a constant offset. - /// \p Idx The given GEP index - /// \p GEP The given GEP - static Value *Extract(Value *Idx, GetElementPtrInst *GEP); - /// Looks for a constant offset from the given GEP index without extracting - /// it. It returns the numeric value of the extracted constant offset (0 if - /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, GetElementPtrInst *GEP); - - private: - ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {} - /// Searches the expression that computes V for a non-zero constant C s.t. - /// V can be reassociated into the form V' + C. If the searching is - /// successful, returns C and update UserChain as a def-use chain from C to V; - /// otherwise, UserChain is empty. +/// This abstract class includes logic to selectively visit values +/// within an expression which are safe to hoist. Subclasses implement +/// the visit method to select among these hoistable values based on +/// subclass-specific objectives such as loop invariance or whether +/// the value is a constant. +class HoistableValueFinder { +public: + HoistableValueFinder(const DataLayout &DLayout) : DL(DLayout) {} + virtual ~HoistableValueFinder() {} + /// Walks the expression tree of Expression visiting each hoistable value. + /// Returns true if a hoistable value is found as indicated by the + /// visit method. The resulting UserChain vector may be fed to + /// ValueExtractor::Extract to extract the hoistable value from the + /// expression. + /// + /// \p Expression The given expression + /// \p NonNegative Whether the expression is guaranteed to be nonnegative + /// This may uncover more hoistable values. 
+ /// \p UserChain If a hoistable value is found, this is filled with + /// the def-use chain of the hoistable value from the + /// hoistable value up to the full expression. + bool find(Value *Expression, bool NonNegative, + SmallVectorImpl &UserChain); + +protected: + /// This method is called with each hoistable value found during + /// the walk of the expression tree performed in find(). This method + /// should return true if the hoistable value is what the subclass + /// is looking for (for example, a constant value for + /// ConstantOffsetFinder). ContinueSearch should be set to true or + /// false depending on whether the search should continue beneath + /// this value. + virtual bool visit(Value *V, bool &ContinueSearch) = 0; + +private: + /// Searches the expression that computes V for a value W s.t. V + /// can be reassociated into the form V' + W where V' is V with W + /// removed. If the searching is successful, returns true and + /// updates UserChain as a def-use chain from W to V; otherwise, + /// UserChain is empty. /// /// \p V The given expression /// \p SignExtended Whether V will be sign-extended in the computation of the - /// GEP index + /// top-level expression /// \p ZeroExtended Whether V will be zero-extended in the computation of the - /// GEP index + /// top-level expression /// \p NonNegative Whether V is guaranteed to be non-negative. For example, /// an index of an inbounds GEP is guaranteed to be - /// non-negative. Levaraging this, we can better split + /// non-negative. Leveraging this, we can better split /// inbounds GEPs. - APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); + bool findInSubexpression(Value *V, bool SignExtended, bool ZeroExtended, + bool NonNegative, + SmallVectorImpl &UserChain); /// A helper function to look into both operands of a binary operator. - APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, - bool ZeroExtended); - /// After finding the constant offset C from the GEP index I, we build a new - /// index I' s.t. I' + C = I. This function builds and returns the new - /// index I' according to UserChain produced by function "find". + bool findInEitherOperand(BinaryOperator *BO, bool SignExtended, + bool ZeroExtended, unsigned &Operand, + SmallVectorImpl &UserChain); + /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. + bool noCommonBits(Value *LHS, Value *RHS) const; + /// Computes which bits are known to be one or zero. + /// \p KnownOne Mask of all bits that are known to be one. + /// \p KnownZero Mask of all bits that are known to be zero. + void computeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; + /// A helper function that returns whether we can trace into the operands + /// of binary operator BO for a constant offset. + /// + /// \p SignExtended Whether BO is surrounded by sext + /// \p ZeroExtended Whether BO is surrounded by zext + /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound + /// array index. + bool canTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, + bool NonNegative); + + const DataLayout &DL; +}; + +// \brief A subclass of HoistableValueFinder for finding hoistable +// constants within an expression. +class ConstantOffsetFinder : HoistableValueFinder { +public: + /// Looks for a constant offset within the given GEP index + /// expression. It returns the numeric value of the extracted + /// constant offset (0 if failed). 
If a constant is found, + /// UserChain is set to the def-use chain for this value which can + /// then be fed to ValueExtractor::Extract to extract the constant + /// from the expression. + static int64_t FindConstOffset(GetElementPtrInst *GEP, unsigned OperandIdx, + SmallVectorImpl &UserChain); + +protected: + virtual bool visit(Value *V, bool &ContinueSearch); + +private: + ConstantOffsetFinder(const DataLayout &DLayout) + : HoistableValueFinder(DLayout), ConstantOffset(1, 0) {} + + APInt ConstantOffset; +}; + +/// \brief A helper class which ranks values according to their loop invariance. +/// +/// The rank of a value is the minimal loop depth that the value can be +/// hoisted up to. Constants and function arguments, for example, have +/// rank 0. +class LICMRanker { +public: + LICMRanker(const LoopInfo *Info) : LI(Info) {} + int getRank(Value *V); + +private: + // Returns true if an instruction cannot be moved. An immovable + // instruction's rank is simply its loop depth. + static bool isImmovableInstruction(Instruction *I); + + const LoopInfo *LI; + // Cache of already computed ranks. + DenseMap RankMap; +}; + +/// \brief A subclass of HoistableValueFinder for finding a loop +/// invariant value within an index expression of a GEP. +class LoopInvariantFinder : HoistableValueFinder { +public: + // Looks for a loop invariant value in the expression for the given + // GEP index. Returns true if a value is found, and UserChain is + // set to the def-use chain of the invariant value. UserChain can + // then be fed to ValueExtractor::Extract. + static bool + FindLoopInvariantValue(GetElementPtrInst *GEP, unsigned OperandIdx, + LICMRanker &Ranker, + SmallVectorImpl &UserChain); + +protected: + virtual bool visit(Value *V, bool &ContinueSearch); + +private: + LoopInvariantFinder(int BR, int MR, LICMRanker &R, const DataLayout &DLayout) + : HoistableValueFinder(DLayout), BaseRank(BR), MaximumRank(MR), + Candidate(nullptr), Ranker(R) {} + // The rank of the expression(s) which the loop invariant value will + // be reassociated with. + int BaseRank; + // The maximum rank of any value to be considered as a profitable candidate. + int MaximumRank; + // Best candidate found so far. + Value *Candidate; + LICMRanker &Ranker; +}; + +/// \brief A helper class for extracting a hoistable value (as defined +/// by HoistableValueFinder) from an expression. +class ValueExtractor { +public: + /// Extracts the value indicated by UserChain from the given + /// expression. UserChain is generated by + /// HoistableValueFinder::Find. Returns the extracted + /// value. Remainder is set to the remaining expression without the + /// extracted value. + static Value *Extract(SmallVectorImpl &UserChain, + Instruction *InsertionPoint, Value *&Remainder); + +private: + ValueExtractor(Instruction *InsertionPoint) : IP(InsertionPoint) {} + /// After finding a hoistable value V within an expression E, we build a new + /// expression E' s.t. E' + V = I. This function builds and returns the new + /// expression E'. /// /// The building conceptually takes two steps: /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I - /// 2) reassociate the expression tree to the form I' + C. + /// that computes E + /// 2) reassociate the expression tree to the form E' + V. /// /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute /// sext to a, b and 5 so that we have /// sext(a) + (sext(b) + 5). /// Then, we reassociate it to /// (sext(a) + sext(b)) + 5. 
- /// Given this form, we know I' is sext(a) + sext(b). - Value *rebuildWithoutConstOffset(); - /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). + /// Given this form, we know E' is sext(a) + sext(b). + Value *rebuildWithoutValue(SmallVectorImpl &UserChain, + Value *&Remainder); + /// After the first step of rebuilding the expression without the + /// hoistable value, distribute s/zext to the operands of all + /// operators in UserChain. e.g., zext(sext(a + (b + 5)) (assuming + /// no overflow) => zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). /// /// The function also updates UserChain to point to new subexpressions after /// distributing s/zext. e.g., the old UserChain of the above example is @@ -257,90 +434,77 @@ /// \p ChainIndex The index to UserChain. ChainIndex is initially /// UserChain.size() - 1, and is decremented during /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); - /// Reassociates the GEP index to the form I' + C and returns I'. - Value *removeConstOffset(unsigned ChainIndex); + Value * + distributeExtsAndCloneChain(SmallVectorImpl &UserChain, + unsigned ChainIndex); + /// Reassociates the expression to the form E' + V and returns E'. + Value *removeHoistableValue(SmallVectorImpl &UserChain, + unsigned ChainIndex); /// A helper function to apply ExtInsts, a list of s/zext, to value V. /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function /// returns "sext i32 (zext i16 V to i32) to i64". Value *applyExts(Value *V); - /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. - bool NoCommonBits(Value *LHS, Value *RHS) const; - /// Computes which bits are known to be one or zero. - /// \p KnownOne Mask of all bits that are known to be one. - /// \p KnownZero Mask of all bits that are known to be zero. - void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; - /// A helper function that returns whether we can trace into the operands - /// of binary operator BO for a constant offset. - /// - /// \p SignExtended Whether BO is surrounded by sext - /// \p ZeroExtended Whether BO is surrounded by zext - /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound - /// array index. - bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, - bool NonNegative); - - /// The path from the constant offset to the old GEP index. e.g., if the GEP - /// index is "a * b + (c + 5)". After running function find, UserChain[0] will - /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and - /// UserChain[2] will be the entire expression "a * b + (c + 5)". - /// - /// This path helps to rebuild the new GEP index. - SmallVector UserChain; /// A data structure used in rebuildWithoutConstOffset. Contains all /// sext/zext instructions along UserChain. SmallVector ExtInsts; - Instruction *IP; /// Insertion position of cloned instructions. + Instruction *IP; /// Insertion position of cloned instructions. }; -/// \brief A pass that tries to split every GEP in the function into a variadic -/// base and a constant offset. It is a FunctionPass because searching for the -/// constant offset may inspect other basic blocks. 
-class SeparateConstOffsetFromGEP : public FunctionPass { - public: +/// \brief A pass that tries to split GEP instructions to enable later +/// optimization opportunities. It is a FunctionPass because searching +/// for the GEP index subexpressions may inspect other basic blocks. +class ReassociateGEPs : public FunctionPass { +public: static char ID; - SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, - bool LowerGEP = false) + ReassociateGEPs(const TargetMachine *TM = nullptr, bool LowerGEP = false) : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { - initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); + initializeReassociateGEPsPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; - private: - /// Tries to split the given GEP into a variadic base and a constant offset, - /// and returns true if the splitting succeeds. - bool splitGEP(GetElementPtrInst *GEP); - /// Lower a GEP with multiple indices into multiple GEPs with a single index. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of GEPs with a single index and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of arithmetic operations and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); +private: + /// Tries to split the given GEP in two different ways. First, we + /// try to split the given GEP into a variadic base and a constant + /// offset. Second, we try to split the GEP into a loop invariant + /// and loop variant GEP. Returns true if the IR is changed. + bool splitGEP(GetElementPtrInst *GEP, LICMRanker &Ranker); + /// Performs the same transformation as splitGEP but also lowers a + /// GEP into either multiple single-index GEPs or arithmetic + /// operands. Unlike in splitGEP the initial splitting for constant + /// aggregation also considers indices of structure type. Returns + /// true if the IR is changed. + bool splitAndLowerGEP(GetElementPtrInst *GEP, LICMRanker &Ranker); + /// Removes hoistable constants from the index operand expressions of a GEP. + void stripConstantsFromGEP(GetElementPtrInst *GEP); + /// Tries to split the GEP to enable loop invariant code motion. + /// Ranker is used to rank values based on loop invariance. Upon + /// return, AddedGEPs includes all GEPs created by splitting. The + /// original GEP may be split more than once. Returns true if the IR + /// is changed. + bool splitGEPForLICM(GetElementPtrInst *GEP, LICMRanker &Ranker, + SmallVectorImpl &AddedGEPs); + /// Lower a GEP with multiple indices into multiple GEPs with a + /// single index and adds in Offset (in bytes). 
+ void lowerToSingleIndexGEPs(GetElementPtrInst *GEP, int64_t Offset); + /// Lowers a GEP into ptrtoint+arithmetics+inttoptr form and adds in + /// Offset (in bytes). + void lowerToArithmetics(GetElementPtrInst *GEP, int64_t Offset); /// Finds the constant offset within each index and accumulates them. If /// LowerGEP is true, it finds in indices of both sequential and structure /// types, otherwise it only finds in sequential indices. The output /// NeedsExtraction indicates whether we successfully find a non-zero constant /// offset. - int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); + int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction, + bool accumulateFieldOffsets); /// Canonicalize array indices to pointer-size integers. This helps to /// simplify the logic of splitting a GEP. For example, if a + b is a /// pointer-size integer, we have @@ -363,29 +527,25 @@ /// multiple GEPs with a single index. bool LowerGEP; }; -} // anonymous namespace +} // anonymous namespace -char SeparateConstOffsetFromGEP::ID = 0; -INITIALIZE_PASS_BEGIN( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) +char ReassociateGEPs::ID = 0; +INITIALIZE_PASS_BEGIN(ReassociateGEPs, "reassociate-geps", + "Reassociate and split GEPs for better LICM and CSE", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) - -FunctionPass * -llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, - bool LowerGEP) { - return new SeparateConstOffsetFromGEP(TM, LowerGEP); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(ReassociateGEPs, "reassociate-geps", + "Reassociate and split GEPs for better LICM and CSE", false, + false) + +FunctionPass *llvm::createReassociateGEPsPass(const TargetMachine *TM, + bool LowerGEP) { + return new ReassociateGEPs(TM, LowerGEP); } -bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, - bool ZeroExtended, - BinaryOperator *BO, - bool NonNegative) { +bool HoistableValueFinder::canTraceInto(bool SignExtended, bool ZeroExtended, + BinaryOperator *BO, bool NonNegative) { // We only consider ADD, SUB and OR, because a non-zero constant found in // expressions composed of these operations can be easily hoisted as a // constant offset by reassociation. @@ -398,7 +558,7 @@ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) + if (BO->getOpcode() == Instruction::Or && !noCommonBits(LHS, RHS)) return false; // In addition, tracing into BO requires that its surrounding s/zext (if @@ -444,70 +604,238 @@ return true; } -APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, - bool SignExtended, - bool ZeroExtended) { - // BO being non-negative does not shed light on whether its operands are - // non-negative. Clear the NonNegative flag here. - APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If we found a constant offset in the left operand, stop and return that. 
- // This shortcut might cause us to miss opportunities of combining the - // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. - // However, such cases are probably already handled by -instcombine, - // given this pass runs after the standard optimizations. - if (ConstantOffset != 0) return ConstantOffset; - ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If U is a sub operator, negate the constant offset found in the right - // operand. - if (BO->getOpcode() == Instruction::Sub) - ConstantOffset = -ConstantOffset; - return ConstantOffset; -} - -APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, - bool ZeroExtended, bool NonNegative) { - // TODO(jingyue): We could trace into integer/pointer casts, such as - // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only - // integers because it gives good enough results for our benchmarks. - unsigned BitWidth = cast(V->getType())->getBitWidth(); - - // We cannot do much with Values that are not a User, such as an Argument. - User *U = dyn_cast(V); - if (U == nullptr) return APInt(BitWidth, 0); +bool HoistableValueFinder::findInEitherOperand( + BinaryOperator *BO, bool SignExtended, bool ZeroExtended, unsigned &Operand, + SmallVectorImpl &UserChain) { + bool Found = false; + for (unsigned Op = 0; Op < 2; ++Op) { + // BO being non-negative does not shed light on whether its operands are + // non-negative. Clear the NonNegative flag here. + if (findInSubexpression(BO->getOperand(Op), SignExtended, ZeroExtended, + /* NonNegative */ false, UserChain)) { + Found = true; + Operand = Op; + } + } + return Found; +} + +bool HoistableValueFinder::findInSubexpression( + Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative, + SmallVectorImpl &UserChain) { + bool ContinueSearch; + bool Found = visit(V, ContinueSearch); + if (Found) + UserChain.clear(); + + unsigned Operand = 0; + if (ContinueSearch) { + // We cannot do much with Values that are not a User, such as an Argument. + if (User *U = dyn_cast(V)) { + if (BinaryOperator *BO = dyn_cast(V)) { + // Trace into subexpressions for more hoisting opportunities. + if (canTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { + Found |= findInEitherOperand(BO, SignExtended, ZeroExtended, Operand, + UserChain); + } + } else if (isa(V)) { + Found |= findInSubexpression(U->getOperand(0), /* SignExtended */ true, + ZeroExtended, NonNegative, UserChain); + } else if (isa(V)) { + // As an optimization, we can clear the SignExtended flag because + // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. + // + // Clear the NonNegative flag, because zext(a) >= 0 does not imply + // a >= 0. 
+ Found |= findInSubexpression(U->getOperand(0), /* SignExtended */ false, + /* ZeroExtended */ true, + /* NonNegative */ false, UserChain); + } + } + } + + if (Found) + UserChain.push_back({V, Operand}); + + return Found; +} + +bool HoistableValueFinder::find(Value *Expression, bool NonNegative, + SmallVectorImpl &UserChain) { + UserChain.clear(); + return (findInSubexpression(Expression, /* SignExtended */ false, + /* ZeroExtended */ false, NonNegative, + UserChain)); +} + +void HoistableValueFinder::computeKnownBits(Value *V, APInt &KnownOne, + APInt &KnownZero) const { + IntegerType *IT = cast(V->getType()); + KnownOne = APInt(IT->getBitWidth(), 0); + KnownZero = APInt(IT->getBitWidth(), 0); + llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); +} + +bool HoistableValueFinder::noCommonBits(Value *LHS, Value *RHS) const { + assert(LHS->getType() == RHS->getType() && + "LHS and RHS should have the same type"); + APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; + computeKnownBits(LHS, LHSKnownOne, LHSKnownZero); + computeKnownBits(RHS, RHSKnownOne, RHSKnownZero); + return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +} + +int64_t ConstantOffsetFinder::FindConstOffset( + GetElementPtrInst *GEP, unsigned OperandIdx, + SmallVectorImpl &UserChain) { + ConstantOffsetFinder Finder(GEP->getModule()->getDataLayout()); + // Indices of an inbound GEP are guaranteed to be non-negative. + if (!Finder.find(GEP->getOperand(OperandIdx), GEP->isInBounds(), UserChain)) + return 0; + + // Sign and zero extend the constant as determined by UserChain. + // Also invert the constant if it appears on the RHS of a subtract. + APInt ExtValue = Finder.ConstantOffset; + for (unsigned i = 0; i < UserChain.size(); ++i) { + Value *V = UserChain[i].value; + unsigned BitWidth = cast(V->getType())->getBitWidth(); + if (isa(V)) + ExtValue = ExtValue.sext(BitWidth); + else if (isa(V)) + ExtValue = ExtValue.zext(BitWidth); + else if (BinaryOperator *BO = dyn_cast(V)) { + if ((BO->getOpcode() == Instruction::Sub) && + (UserChain[i].operandNo == 1)) + ExtValue = -ExtValue; + } + } + return ExtValue.getSExtValue(); +} + +bool ConstantOffsetFinder::visit(Value *V, bool &ContinueSearch) { + // Don't keep searching if we've already found a constant. This + // shortcut might cause us to miss opportunities of combining the + // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + // + b) + 9. However, such cases are probably already handled by + // -instcombine, given this pass runs after the standard + // optimizations. + if (ConstantOffset != 0) { + ContinueSearch = false; + return false; + } - APInt ConstantOffset(BitWidth, 0); if (ConstantInt *CI = dyn_cast(V)) { // Hooray, we found it! ConstantOffset = CI->getValue(); - } else if (BinaryOperator *BO = dyn_cast(V)) { - // Trace into subexpressions for more hoisting opportunities. - if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { - ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); - } - } else if (isa(V)) { - ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, - ZeroExtended, NonNegative).sext(BitWidth); - } else if (isa(V)) { - // As an optimization, we can clear the SignExtended flag because - // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. - // - // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. 
- ConstantOffset = - find(U->getOperand(0), /* SignExtended */ false, - /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); + if (ConstantOffset != 0) { + ContinueSearch = false; + return true; + } + } + + ContinueSearch = true; + return false; +} + +bool LICMRanker::isImmovableInstruction(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::LandingPad: + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Invoke: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Call: + return true; + default: + return false; + } +} + +int LICMRanker::getRank(Value *V) { + Instruction *I = dyn_cast(V); + if (!I) + return 0; + + if (RankMap.find(I) != RankMap.end()) + return RankMap[I]; + + int Rank = 0; + if (Loop *L = LI->getLoopFor(I->getParent())) { + int MaxRank = L->getLoopDepth(); + if (isImmovableInstruction(I)) + Rank = MaxRank; + else { + for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank < MaxRank; + ++i) + Rank = std::max(Rank, getRank(I->getOperand(i))); + Rank = std::min(MaxRank, Rank); + } + } + RankMap[I] = Rank; + return Rank; +} + +bool LoopInvariantFinder::FindLoopInvariantValue( + GetElementPtrInst *GEP, unsigned OperandIdx, LICMRanker &Ranker, + SmallVectorImpl &UserChain) { + int BaseRank = 0; + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 0; I < OperandIdx; ++I, ++GTI) { + if (isa(*GTI)) + BaseRank = std::max(BaseRank, Ranker.getRank(GEP->getOperand(I))); } - // If we found a non-zero constant offset, add it to the path for - // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't - // help this optimization. - if (ConstantOffset != 0) - UserChain.push_back(U); - return ConstantOffset; + Value *GEPOperand = GEP->getOperand(OperandIdx); + // If the operand as a whole is at least as invariant as the base + // then reassociation and splitting at this operand will not enable + // any further LICM. + if (Ranker.getRank(GEPOperand) <= BaseRank) + return false; + + LoopInvariantFinder Finder(BaseRank, Ranker.getRank(GEPOperand) - 1, Ranker, + GEP->getModule()->getDataLayout()); + // Indices of an inbound GEP are guaranteed to be non-negative. + return Finder.find(GEPOperand, GEP->isInBounds(), UserChain); +} + +bool LoopInvariantFinder::visit(Value *V, bool &ContinueSearch) { + // Considerations for candidate values: + // 1) We want the value to be as loop invariant as possible. + // 2) We don't want to break up an expression which is more loop + // invariant (lower rank) than the expression the value will be + // reassociated with (indicated by BaseRank). + // 3) We don't want to duplicate any code unnecessarily so only recurse + // into expressions which are not used elsewhere. + bool Found = false; + if ((Ranker.getRank(V) <= MaximumRank) && + (!Candidate || (Ranker.getRank(V) < Ranker.getRank(Candidate)))) { + Candidate = V; + Found = true; + } + + // To avoid duplicating code during extraction, don't recurse into + // expressions which have more than one use. Also, don't recurse + // into expressions which are already loop invariant (at least as + // loop invariant as BaseRank). + ContinueSearch = true; + if (!V->hasOneUse() || (Ranker.getRank(V) <= BaseRank)) + ContinueSearch = false; + else if (BinaryOperator *BO = dyn_cast(V)) { + // Don't recurse into subtract expressions. 
+ if (BO->getOpcode() == Instruction::Sub) + ContinueSearch = false; + } + + return Found; } -Value *ConstantOffsetExtractor::applyExts(Value *V) { +Value *ValueExtractor::applyExts(Value *V) { Value *Current = V; // ExtInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. @@ -526,71 +854,79 @@ return Current; } -Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); +Value *ValueExtractor::rebuildWithoutValue( + SmallVectorImpl &UserChain, Value *&ExtractedValue) { + assert(UserChain.size() > 0); + ExtInsts.clear(); + distributeExtsAndCloneChain(UserChain, UserChain.size() - 1); + // Remove all nullptrs (used to be s/zext) from UserChain. unsigned NewSize = 0; for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { - if (*I != nullptr) { + if (I->value != nullptr) { UserChain[NewSize] = *I; NewSize++; } } UserChain.resize(NewSize); - return removeConstOffset(UserChain.size() - 1); + ExtractedValue = UserChain[0].value; + return removeHoistableValue(UserChain, UserChain.size() - 1); } -Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { - User *U = UserChain[ChainIndex]; +Value *ValueExtractor::distributeExtsAndCloneChain( + SmallVectorImpl &UserChain, unsigned ChainIndex) { + Value *V = UserChain[ChainIndex].value; if (ChainIndex == 0) { - assert(isa(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast(applyExts(U)); + UserChain[ChainIndex] = {applyExts(V), 0}; + return UserChain[ChainIndex].value; } - if (CastInst *Cast = dyn_cast(U)) { + if (CastInst *Cast = dyn_cast(V)) { assert((isa(Cast) || isa(Cast)) && "We only traced into two types of CastInst: sext and zext"); ExtInsts.push_back(Cast); - UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); + UserChain[ChainIndex] = {nullptr, 0}; + return distributeExtsAndCloneChain(UserChain, ChainIndex - 1); } - // Function find only trace into BinaryOperator and CastInst. - BinaryOperator *BO = cast(U); - // OpNo = which operand of BO is UserChain[ChainIndex - 1] - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + // Function find only traces into BinaryOperator and CastInst. + BinaryOperator *BO = cast(V); + Value *TheOther = + applyExts(BO->getOperand(1 - UserChain[ChainIndex].operandNo)); + Value *NextInChain = distributeExtsAndCloneChain(UserChain, ChainIndex - 1); BinaryOperator *NewBO = nullptr; - if (OpNo == 0) { + if (UserChain[ChainIndex].operandNo == 0) NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther, BO->getName(), IP); - } else { + else NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain, BO->getName(), IP); - } - return UserChain[ChainIndex] = NewBO; -} -Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { - if (ChainIndex == 0) { - assert(isa(UserChain[ChainIndex])); - return ConstantInt::getNullValue(UserChain[ChainIndex]->getType()); - } + UserChain[ChainIndex] = {NewBO, UserChain[ChainIndex].operandNo}; + return UserChain[ChainIndex].value; +} - BinaryOperator *BO = cast(UserChain[ChainIndex]); - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 
0 : 1); - assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); - Value *NextInChain = removeConstOffset(ChainIndex - 1); - Value *TheOther = BO->getOperand(1 - OpNo); +Value *ValueExtractor::removeHoistableValue( + SmallVectorImpl &UserChain, unsigned ChainIndex) { + if (ChainIndex == 0) + return ConstantInt::getNullValue(UserChain[ChainIndex].value->getType()); + + // S/Zext operations have been removed from the chain so only binary + // operators remain. + BinaryOperator *BO = cast(UserChain[ChainIndex].value); + assert(BO->getOperand(UserChain[ChainIndex].operandNo) == + UserChain[ChainIndex - 1].value && + "Invalid operand number in UserChain"); + Value *NextInChain = removeHoistableValue(UserChain, ChainIndex - 1); + Value *TheOther = BO->getOperand(1 - UserChain[ChainIndex].operandNo); // If NextInChain is 0 and not the LHS of a sub, we can simplify the // sub-expression to be just TheOther. if (ConstantInt *CI = dyn_cast(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) + if (CI->isZero() && + !(BO->getOpcode() == Instruction::Sub && + UserChain[ChainIndex].operandNo == 0)) return TheOther; } @@ -608,7 +944,7 @@ // // Replacing the "or" with "add" is fine, because // a | (b + 5) = a + (b + 5) = (a + b) + 5 - if (OpNo == 0) { + if (UserChain[ChainIndex].operandNo == 0) { return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), IP); } else { @@ -623,7 +959,7 @@ "distributeExtsAndCloneChain clones each BinaryOperator in " "UserChain, so no one should be used more than " "once"); - BO->setOperand(OpNo, NextInChain); + BO->setOperand(UserChain[ChainIndex].operandNo, NextInChain); BO->setHasNoSignedWrap(false); BO->setHasNoUnsignedWrap(false); // Make sure it appears after all instructions we've inserted so far. @@ -631,45 +967,15 @@ return BO; } -Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(GEP); - // Find a non-zero constant offset first. - APInt ConstantOffset = - Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()); - if (ConstantOffset == 0) - return nullptr; - // Separates the constant offset from the GEP index. - return Extractor.rebuildWithoutConstOffset(); -} - -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { - // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. 
- return ConstantOffsetExtractor(GEP) - .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()) - .getSExtValue(); -} - -void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, - APInt &KnownZero) const { - IntegerType *IT = cast(V->getType()); - KnownOne = APInt(IT->getBitWidth(), 0); - KnownZero = APInt(IT->getBitWidth(), 0); - const DataLayout &DL = IP->getModule()->getDataLayout(); - llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); -} - -bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { - assert(LHS->getType() == RHS->getType() && - "LHS and RHS should have the same type"); - APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; - ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); - ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +Value *ValueExtractor::Extract(SmallVectorImpl &UserChain, + Instruction *InsertionPoint, Value *&Remainder) { + ValueExtractor Extractor(InsertionPoint); + Value *Extracted; + Remainder = Extractor.rebuildWithoutValue(UserChain, Extracted); + return Extracted; } -bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( +bool ReassociateGEPs::canonicalizeArrayIndicesToPointerSize( GetElementPtrInst *GEP) { bool Changed = false; const DataLayout &DL = GEP->getModule()->getDataLayout(); @@ -688,27 +994,28 @@ return Changed; } -int64_t -SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, - bool &NeedsExtraction) { +int64_t ReassociateGEPs::accumulateByteOffset(GetElementPtrInst *GEP, + bool &NeedsExtraction, + bool accumulateFieldOffsets) { NeedsExtraction = false; int64_t AccumulativeByteOffset = 0; gep_type_iterator GTI = gep_type_begin(*GEP); const DataLayout &DL = GEP->getModule()->getDataLayout(); + SmallVector UserChain; for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); + ConstantOffsetFinder::FindConstOffset(GEP, I, UserChain); if (ConstantOffset != 0) { NeedsExtraction = true; - // A GEP may have multiple indices. We accumulate the extracted + // A GEP may have multiple indices. We accumulate the extracted // constant offset to a byte offset, and later offset the remainder of // the original GEP with this byte offset. AccumulativeByteOffset += ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType()); } - } else if (LowerGEP) { + } else if (accumulateFieldOffsets) { StructType *StTy = cast(*GTI); uint64_t Field = cast(GEP->getOperand(I))->getZExtValue(); // Skip field 0 as the offset is always 0. 
@@ -722,24 +1029,24 @@ return AccumulativeByteOffset; } -void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( - GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); +void ReassociateGEPs::lowerToSingleIndexGEPs(GetElementPtrInst *GEP, + int64_t Offset) { + IRBuilder<> Builder(GEP); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); Type *I8PtrTy = - Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); - Value *ResultPtr = Variadic->getOperand(0); + Builder.getInt8PtrTy(GEP->getType()->getPointerAddressSpace()); + Value *ResultPtr = GEP->getOperand(0); if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); + gep_type_iterator GTI = gep_type_begin(*GEP); // Create an ugly GEP for each sequential index. We don't create GEPs for // structure indices, as they are accumulated in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); + Value *Idx = GEP->getOperand(I); // Skip zero indices. if (ConstantInt *CI = dyn_cast(Idx)) if (CI->isZero()) @@ -757,37 +1064,38 @@ } } // Create an ugly GEP with a single index for each index. - ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep"); + ResultPtr = + Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); } } // Create a GEP with the constant offset index. - if (AccumulativeByteOffset != 0) { - Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); - ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep"); - } - if (ResultPtr->getType() != Variadic->getType()) - ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); - - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -void -SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); + if (Offset != 0) { + Value *OffsetValue = ConstantInt::get(IntPtrTy, Offset); + ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, OffsetValue, + "uglygep"); + } + if (ResultPtr->getType() != GEP->getType()) + ResultPtr = Builder.CreateBitCast(ResultPtr, GEP->getType()); + + GEP->replaceAllUsesWith(ResultPtr); + GEP->eraseFromParent(); +} - Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); +void ReassociateGEPs::lowerToArithmetics(GetElementPtrInst *GEP, + int64_t Offset) { + IRBuilder<> Builder(GEP); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); + + Value *ResultPtr = Builder.CreatePtrToInt(GEP->getOperand(0), IntPtrTy); + gep_type_iterator GTI = gep_type_begin(*GEP); // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We // don't create arithmetics for structure indices, as they are accumulated // in the constant offset index. 
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); + Value *Idx = GEP->getOperand(I); // Skip zero indices. if (ConstantInt *CI = dyn_cast(Idx)) if (CI->isZero()) @@ -810,51 +1118,91 @@ } // Create an ADD for the constant offset index. - if (AccumulativeByteOffset != 0) { - ResultPtr = Builder.CreateAdd( - ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset)); + if (Offset != 0) { + ResultPtr = + Builder.CreateAdd(ResultPtr, ConstantInt::get(IntPtrTy, Offset)); } - ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType()); - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); + ResultPtr = Builder.CreateIntToPtr(ResultPtr, GEP->getType()); + GEP->replaceAllUsesWith(ResultPtr); + GEP->eraseFromParent(); } -bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { - // Skip vector GEPs. - if (GEP->getType()->isVectorTy()) - return false; +// Split the GEP at the specified index. IdxPart1 and IdxPart2 are +// the two components of the index operand at which the gep is split. +// For example, with: +// %idx = add %a, %b +// = gep %base, 42, %idx, %foo, %bar +// The gep can be split at operand 2 (%idx) with IdxPart1 equal to %a +// and IdxPart2 equals to %b. The resultant code is: +// %invgep = gep %base, 42, %a +// = gep %invgep, %b, %foo, %bar +// The potential advantage is that %invgep may be loop invariant and can +// be hoisted out of loops. +static std::pair splitGEPAtIndex(GetElementPtrInst *GEP, + unsigned OperandIndex, + Value *IdxPart1, + Value *IdxPart2) { + IRBuilder<> Builder(GEP); + SmallVector GEPOperands; + for (unsigned i = 1; i < OperandIndex; ++i) + GEPOperands.push_back(GEP->getOperand(i)); + GEPOperands.push_back(IdxPart1); + Value *InvariantGEP = + Builder.CreateGEP(GEP->getOperand(0), GEPOperands, "invgep"); + + GEPOperands.clear(); + GEPOperands.push_back(IdxPart2); + for (unsigned i = OperandIndex + 1; i < GEP->getNumOperands(); i++) + GEPOperands.push_back(GEP->getOperand(i)); + Value *RemainderGEP = Builder.CreateGEP(InvariantGEP, GEPOperands, "vargep"); + GEP->replaceAllUsesWith(RemainderGEP); + GEP->eraseFromParent(); - // The backend can already nicely handle the case where all indices are - // constant. - if (GEP->hasAllConstantIndices()) - return false; + return std::make_pair(InvariantGEP, RemainderGEP); +} - bool Changed = canonicalizeArrayIndicesToPointerSize(GEP); +bool ReassociateGEPs::splitGEPForLICM( + GetElementPtrInst *GEP, LICMRanker &Ranker, + SmallVectorImpl &AddedGEPs) { + int GEPRank = Ranker.getRank(GEP); + if (GEPRank == 0) + // GEP is loop invariant. Nothing to do here. + return false; - bool NeedsExtraction; - int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (!isa(*GTI)) + continue; - if (!NeedsExtraction) - return Changed; - // If LowerGEP is disabled, before really splitting the GEP, check whether the - // backend supports the addressing mode we are about to produce. If no, this - // splitting probably won't be beneficial. - // If LowerGEP is enabled, even the extracted constant offset can not match - // the addressing mode, we can still do optimizations to other lowered parts - // of variable indices. Therefore, we don't check for addressing modes in that - // case. 
- if (!LowerGEP) { - TargetTransformInfo &TTI = - getAnalysis().getTTI( - *GEP->getParent()->getParent()); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), - /*BaseGV=*/nullptr, AccumulativeByteOffset, - /*HasBaseReg=*/true, /*Scale=*/0)) { - return Changed; + SmallVector UserChain; + if (LoopInvariantFinder::FindLoopInvariantValue(GEP, I, Ranker, + UserChain)) { + Value *Remainder = nullptr; + Value *Extracted = ValueExtractor::Extract(UserChain, GEP, Remainder); + std::pair SplitGEPs = + splitGEPAtIndex(GEP, I, Extracted, Remainder); + + // Newly created GEPs may be Constant (not GetElementPtrInst). + if (GetElementPtrInst *NewGEP = + dyn_cast(SplitGEPs.first)) + AddedGEPs.push_back(NewGEP); + // Now try to reassociate the second GEP which may include + // opportunities for further reassociation. + if (GetElementPtrInst *NewGEP = + dyn_cast(SplitGEPs.second)) { + if (!splitGEPForLICM(NewGEP, Ranker, AddedGEPs)) + // If the second GEP was not split, then add it to the vector + // of new GEPs. + AddedGEPs.push_back(NewGEP); + } + return true; } } + return false; +} +void ReassociateGEPs::stripConstantsFromGEP(GetElementPtrInst *GEP) { // Remove the constant offset in each sequential index. The resultant GEP // computes the variadic base. // Notice that we don't remove struct field indices here. If LowerGEP is @@ -867,13 +1215,14 @@ if (isa(*GTI)) { // Splits this GEP index into a variadic part and a constant offset, and // uses the variadic part as the new index. - Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP); - if (NewIdx != nullptr) { - GEP->setOperand(I, NewIdx); + SmallVector UserChain; + if (ConstantOffsetFinder::FindConstOffset(GEP, I, UserChain)) { + Value *Remainder; + ValueExtractor::Extract(UserChain, GEP, Remainder); + GEP->setOperand(I, Remainder); } } } - // Clear the inbounds attribute because the new index may be off-bound. // e.g., // @@ -894,21 +1243,39 @@ // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. GEP->setIsInBounds(false); +} - // Lowers a GEP to either GEPs with a single index or arithmetic operations. - if (LowerGEP) { - // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to - // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) - lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); - else - lowerToArithmetics(GEP, AccumulativeByteOffset); - return true; - } +bool ReassociateGEPs::splitGEP(GetElementPtrInst *GEP, LICMRanker &Ranker) { + bool NeedsExtraction; + int64_t AccumulativeByteOffset = + accumulateByteOffset(GEP, NeedsExtraction, false); + + SmallVector AddedGEPs; + if (!NeedsExtraction) + return splitGEPForLICM(GEP, Ranker, AddedGEPs); + + // If LowerGEP is disabled, before really splitting the GEP, check whether the + // backend supports the addressing mode we are about to produce. If no, this + // splitting probably won't be beneficial. + // If LowerGEP is enabled, even the extracted constant offset can not match + // the addressing mode, we can still do optimizations to other lowered parts + // of variable indices. Therefore, we don't check for addressing modes in that + // case. 
+ TargetTransformInfo &TTI = + getAnalysis().getTTI( + *GEP->getParent()->getParent()); + if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), + /*BaseGV=*/nullptr, AccumulativeByteOffset, + /*HasBaseReg=*/true, /*Scale=*/0)) + return splitGEPForLICM(GEP, Ranker, AddedGEPs); + + stripConstantsFromGEP(GEP); // No need to create another GEP if the accumulative byte offset is 0. - if (AccumulativeByteOffset == 0) + if (AccumulativeByteOffset == 0) { + splitGEPForLICM(GEP, Ranker, AddedGEPs); return true; + } // Offsets the base with the accumulative byte offset. // @@ -938,7 +1305,7 @@ // %uglygep = gep %0, // %new.gep = bitcast %uglygep to // ... %new.gep ... - Instruction *NewGEP = GEP->clone(); + GetElementPtrInst *NewGEP = cast(GEP->clone()); NewGEP->insertBefore(GEP); // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned = @@ -948,11 +1315,12 @@ int64_t ElementTypeSizeOfGEP = static_cast( DL.getTypeAllocSize(GEP->getType()->getElementType())); Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); + Instruction *NewDef; if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { // Very likely. As long as %gep is natually aligned, the byte offset we // extracted should be a multiple of sizeof(*%gep). int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; - NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, + NewDef = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); } else { @@ -970,35 +1338,81 @@ // sizeof(int64). // // Emit an uglygep in this case. - Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), - GEP->getPointerAddressSpace()); - NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); - NewGEP = GetElementPtrInst::Create( - Type::getInt8Ty(GEP->getContext()), NewGEP, + Type *I8PtrTy = + Type::getInt8PtrTy(GEP->getContext(), GEP->getPointerAddressSpace()); + NewDef = new BitCastInst(NewGEP, I8PtrTy, "", GEP); + NewDef = GetElementPtrInst::Create( + Type::getInt8Ty(GEP->getContext()), NewDef, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); if (GEP->getType() != I8PtrTy) - NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); + NewDef = new BitCastInst(NewDef, GEP->getType(), GEP->getName(), GEP); } - GEP->replaceAllUsesWith(NewGEP); + GEP->replaceAllUsesWith(NewDef); GEP->eraseFromParent(); + splitGEPForLICM(NewGEP, Ranker, AddedGEPs); return true; } -bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { +bool ReassociateGEPs::splitAndLowerGEP(GetElementPtrInst *GEP, + LICMRanker &Ranker) { + bool NeedsExtraction; + int64_t AccumulativeByteOffset = + accumulateByteOffset(GEP, NeedsExtraction, true); + + SmallVector GEPsToLower; + if (NeedsExtraction) + stripConstantsFromGEP(GEP); + + splitGEPForLICM(GEP, Ranker, GEPsToLower); + if (GEPsToLower.empty()) + GEPsToLower.push_back(GEP); + + for (unsigned i = 0; i < GEPsToLower.size(); ++i) { + GetElementPtrInst *NewGEP = GEPsToLower[i]; + // Only add the byte offset to the last GEP. + int64_t Offset = (i == GEPsToLower.size() - 1) ? AccumulativeByteOffset : 0; + // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to + // arithmetic operations if the target uses alias analysis in codegen. 
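// Illustrative only (hypothetical names): the two lowered forms chosen below.
// The accumulated constant (20 bytes in this sketch) is applied once, to the
// last GEP in GEPsToLower, per the Offset selection above.
//
//   useAA():   %g0 = getelementptr i8, i8* %base.i8, i64 %idx.scaled
//              %g1 = getelementptr i8, i8* %g0, i64 20
//              %p  = bitcast i8* %g1 to float*
//   otherwise: %s  = add i64 %base.int, %idx.scaled
//              %t  = add i64 %s, 20
//              %p  = inttoptr i64 %t to float*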
+ if (TM && TM->getSubtargetImpl(*NewGEP->getParent()->getParent())->useAA()) + lowerToSingleIndexGEPs(NewGEP, Offset); + else + lowerToArithmetics(NewGEP, Offset); + } + return true; +} + +bool ReassociateGEPs::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - if (DisableSeparateConstOffsetFromGEP) + if (DisableReassociateGEPs) return false; + LoopInfo *LI = &getAnalysis().getLoopInfo(); + LICMRanker Ranker(LI); + bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) { if (GetElementPtrInst *GEP = dyn_cast(I++)) { - Changed |= splitGEP(GEP); + // Skip vector GEPs. + if (GEP->getType()->isVectorTy()) + continue; + + // The backend can already nicely handle the case where all indices are + // constant. + if (GEP->hasAllConstantIndices()) + continue; + + Changed |= canonicalizeArrayIndicesToPointerSize(GEP); + + if (LowerGEP) + Changed |= splitAndLowerGEP(GEP, Ranker); + else + Changed |= splitGEP(GEP, Ranker); } // No need to split GEP ConstantExprs because all its indices are constant // already. Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -73,7 +73,7 @@ initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); - initializeSeparateConstOffsetFromGEPPass(Registry); + initializeReassociateGEPsPass(Registry); initializeStraightLineStrengthReducePass(Registry); initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp =================================================================== --- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1,1010 +0,0 @@ -//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Loop unrolling may create many similar GEPs for array accesses. -// e.g., a 2-level loop -// -// float a[32][32]; // global variable -// -// for (int i = 0; i < 2; ++i) { -// for (int j = 0; j < 2; ++j) { -// ... -// ... = a[x + i][y + j]; -// ... -// } -// } -// -// will probably be unrolled to: -// -// gep %a, 0, %x, %y; load -// gep %a, 0, %x, %y + 1; load -// gep %a, 0, %x + 1, %y; load -// gep %a, 0, %x + 1, %y + 1; load -// -// LLVM's GVN does not use partial redundancy elimination yet, and is thus -// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs -// significant slowdown in targets with limited addressing modes. For instance, -// because the PTX target does not support the reg+reg addressing mode, the -// NVPTX backend emits PTX code that literally computes the pointer address of -// each GEP, wasting tons of registers. It emits the following PTX for the -// first load and similar PTX for other loads. 
-// -// mov.u32 %r1, %x; -// mov.u32 %r2, %y; -// mul.wide.u32 %rl2, %r1, 128; -// mov.u64 %rl3, a; -// add.s64 %rl4, %rl3, %rl2; -// mul.wide.u32 %rl5, %r2, 4; -// add.s64 %rl6, %rl4, %rl5; -// ld.global.f32 %f1, [%rl6]; -// -// To reduce the register pressure, the optimization implemented in this file -// merges the common part of a group of GEPs, so we can compute each pointer -// address by adding a simple offset to the common part, saving many registers. -// -// It works by splitting each GEP into a variadic base and a constant offset. -// The variadic base can be computed once and reused by multiple GEPs, and the -// constant offsets can be nicely folded into the reg+immediate addressing mode -// (supported by most targets) without using any extra register. -// -// For instance, we transform the four GEPs and four loads in the above example -// into: -// -// base = gep a, 0, x, y -// load base -// laod base + 1 * sizeof(float) -// load base + 32 * sizeof(float) -// load base + 33 * sizeof(float) -// -// Given the transformed IR, a backend that supports the reg+immediate -// addressing mode can easily fold the pointer arithmetics into the loads. For -// example, the NVPTX backend can easily fold the pointer arithmetics into the -// ld.global.f32 instructions, and the resultant PTX uses much fewer registers. -// -// mov.u32 %r1, %tid.x; -// mov.u32 %r2, %tid.y; -// mul.wide.u32 %rl2, %r1, 128; -// mov.u64 %rl3, a; -// add.s64 %rl4, %rl3, %rl2; -// mul.wide.u32 %rl5, %r2, 4; -// add.s64 %rl6, %rl4, %rl5; -// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX -// ld.global.f32 %f2, [%rl6+4]; // much better -// ld.global.f32 %f3, [%rl6+128]; // much better -// ld.global.f32 %f4, [%rl6+132]; // much better -// -// Another improvement enabled by the LowerGEP flag is to lower a GEP with -// multiple indices to either multiple GEPs with a single index or arithmetic -// operations (depending on whether the target uses alias analysis in codegen). -// Such transformation can have following benefits: -// (1) It can always extract constants in the indices of structure type. -// (2) After such Lowering, there are more optimization opportunities such as -// CSE, LICM and CGP. -// -// E.g. The following GEPs have multiple indices: -// BB1: -// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3 -// load %p -// ... -// BB2: -// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2 -// load %p2 -// ... -// -// We can not do CSE for to the common part related to index "i64 %i". Lowering -// GEPs can achieve such goals. -// If the target does not use alias analysis in codegen, this pass will -// lower a GEP with multiple indices into arithmetic operations: -// BB1: -// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity -// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %3 = add i64 %1, %2 ; CSE opportunity -// %4 = mul i64 %j1, length_of_struct -// %5 = add i64 %3, %4 -// %6 = add i64 %3, struct_field_3 ; Constant offset -// %p = inttoptr i64 %6 to i32* -// load %p -// ... -// BB2: -// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity -// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %9 = add i64 %7, %8 ; CSE opportunity -// %10 = mul i64 %j2, length_of_struct -// %11 = add i64 %9, %10 -// %12 = add i64 %11, struct_field_2 ; Constant offset -// %p = inttoptr i64 %12 to i32* -// load %p2 -// ... 
-// -// If the target uses alias analysis in codegen, this pass will lower a GEP -// with multiple indices into multiple GEPs with a single index: -// BB1: -// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity -// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity -// %4 = mul i64 %j1, length_of_struct -// %5 = getelementptr i8* %3, i64 %4 -// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset -// %p = bitcast i8* %6 to i32* -// load %p -// ... -// BB2: -// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity -// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity -// %10 = mul i64 %j2, length_of_struct -// %11 = getelementptr i8* %9, i64 %10 -// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset -// %p2 = bitcast i8* %12 to i32* -// load %p2 -// ... -// -// Lowering GEPs can also benefit other passes such as LICM and CGP. -// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple -// indices if one of the index is variant. If we lower such GEP into invariant -// parts and variant parts, LICM can hoist/sink those invariant parts. -// CGP (CodeGen Prepare) tries to sink address calculations that match the -// target's addressing modes. A GEP with multiple indices may not match and will -// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of -// them. So we end up with a better addressing mode. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/IR/IRBuilder.h" - -using namespace llvm; - -static cl::opt DisableSeparateConstOffsetFromGEP( - "disable-separate-const-offset-from-gep", cl::init(false), - cl::desc("Do not separate the constant offset from a GEP instruction"), - cl::Hidden); - -namespace { - -/// \brief A helper class for separating a constant offset from a GEP index. -/// -/// In real programs, a GEP index may be more complicated than a simple addition -/// of something and a constant integer which can be trivially splitted. For -/// example, to split ((a << 3) | 5) + b, we need to search deeper for the -/// constant offset, so that we can separate the index to (a << 3) + b and 5. -/// -/// Therefore, this class looks into the expression that computes a given GEP -/// index, and tries to find a constant integer that can be hoisted to the -/// outermost level of the expression as an addition. Not every constant in an -/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a + -/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case, -/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). -class ConstantOffsetExtractor { - public: - /// Extracts a constant offset from the given GEP index. It returns the - /// new index representing the remainder (equal to the original index minus - /// the constant offset), or nullptr if we cannot extract a constant offset. 
- /// \p Idx The given GEP index - /// \p GEP The given GEP - static Value *Extract(Value *Idx, GetElementPtrInst *GEP); - /// Looks for a constant offset from the given GEP index without extracting - /// it. It returns the numeric value of the extracted constant offset (0 if - /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, GetElementPtrInst *GEP); - - private: - ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {} - /// Searches the expression that computes V for a non-zero constant C s.t. - /// V can be reassociated into the form V' + C. If the searching is - /// successful, returns C and update UserChain as a def-use chain from C to V; - /// otherwise, UserChain is empty. - /// - /// \p V The given expression - /// \p SignExtended Whether V will be sign-extended in the computation of the - /// GEP index - /// \p ZeroExtended Whether V will be zero-extended in the computation of the - /// GEP index - /// \p NonNegative Whether V is guaranteed to be non-negative. For example, - /// an index of an inbounds GEP is guaranteed to be - /// non-negative. Levaraging this, we can better split - /// inbounds GEPs. - APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); - /// A helper function to look into both operands of a binary operator. - APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, - bool ZeroExtended); - /// After finding the constant offset C from the GEP index I, we build a new - /// index I' s.t. I' + C = I. This function builds and returns the new - /// index I' according to UserChain produced by function "find". - /// - /// The building conceptually takes two steps: - /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I - /// 2) reassociate the expression tree to the form I' + C. - /// - /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute - /// sext to a, b and 5 so that we have - /// sext(a) + (sext(b) + 5). - /// Then, we reassociate it to - /// (sext(a) + sext(b)) + 5. - /// Given this form, we know I' is sext(a) + sext(b). - Value *rebuildWithoutConstOffset(); - /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). - /// - /// The function also updates UserChain to point to new subexpressions after - /// distributing s/zext. e.g., the old UserChain of the above example is - /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), - /// and the new UserChain is - /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) - /// - /// \p ChainIndex The index to UserChain. ChainIndex is initially - /// UserChain.size() - 1, and is decremented during - /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); - /// Reassociates the GEP index to the form I' + C and returns I'. - Value *removeConstOffset(unsigned ChainIndex); - /// A helper function to apply ExtInsts, a list of s/zext, to value V. - /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function - /// returns "sext i32 (zext i16 V to i32) to i64". - Value *applyExts(Value *V); - - /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. 
- bool NoCommonBits(Value *LHS, Value *RHS) const; - /// Computes which bits are known to be one or zero. - /// \p KnownOne Mask of all bits that are known to be one. - /// \p KnownZero Mask of all bits that are known to be zero. - void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; - /// A helper function that returns whether we can trace into the operands - /// of binary operator BO for a constant offset. - /// - /// \p SignExtended Whether BO is surrounded by sext - /// \p ZeroExtended Whether BO is surrounded by zext - /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound - /// array index. - bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, - bool NonNegative); - - /// The path from the constant offset to the old GEP index. e.g., if the GEP - /// index is "a * b + (c + 5)". After running function find, UserChain[0] will - /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and - /// UserChain[2] will be the entire expression "a * b + (c + 5)". - /// - /// This path helps to rebuild the new GEP index. - SmallVector UserChain; - /// A data structure used in rebuildWithoutConstOffset. Contains all - /// sext/zext instructions along UserChain. - SmallVector ExtInsts; - Instruction *IP; /// Insertion position of cloned instructions. -}; - -/// \brief A pass that tries to split every GEP in the function into a variadic -/// base and a constant offset. It is a FunctionPass because searching for the -/// constant offset may inspect other basic blocks. -class SeparateConstOffsetFromGEP : public FunctionPass { - public: - static char ID; - SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, - bool LowerGEP = false) - : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { - initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - } - - bool runOnFunction(Function &F) override; - - private: - /// Tries to split the given GEP into a variadic base and a constant offset, - /// and returns true if the splitting succeeds. - bool splitGEP(GetElementPtrInst *GEP); - /// Lower a GEP with multiple indices into multiple GEPs with a single index. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of GEPs with a single index and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of arithmetic operations and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Finds the constant offset within each index and accumulates them. If - /// LowerGEP is true, it finds in indices of both sequential and structure - /// types, otherwise it only finds in sequential indices. 
The output - /// NeedsExtraction indicates whether we successfully find a non-zero constant - /// offset. - int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); - /// Canonicalize array indices to pointer-size integers. This helps to - /// simplify the logic of splitting a GEP. For example, if a + b is a - /// pointer-size integer, we have - /// gep base, a + b = gep (gep base, a), b - /// However, this equality may not hold if the size of a + b is smaller than - /// the pointer size, because LLVM conceptually sign-extends GEP indices to - /// pointer size before computing the address - /// (http://llvm.org/docs/LangRef.html#id181). - /// - /// This canonicalization is very likely already done in clang and - /// instcombine. Therefore, the program will probably remain the same. - /// - /// Returns true if the module changes. - /// - /// Verified in @i32_add in split-gep.ll - bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); - - const TargetMachine *TM; - /// Whether to lower a GEP with multiple indices into arithmetic operations or - /// multiple GEPs with a single index. - bool LowerGEP; -}; -} // anonymous namespace - -char SeparateConstOffsetFromGEP::ID = 0; -INITIALIZE_PASS_BEGIN( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) - -FunctionPass * -llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, - bool LowerGEP) { - return new SeparateConstOffsetFromGEP(TM, LowerGEP); -} - -bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, - bool ZeroExtended, - BinaryOperator *BO, - bool NonNegative) { - // We only consider ADD, SUB and OR, because a non-zero constant found in - // expressions composed of these operations can be easily hoisted as a - // constant offset by reassociation. - if (BO->getOpcode() != Instruction::Add && - BO->getOpcode() != Instruction::Sub && - BO->getOpcode() != Instruction::Or) { - return false; - } - - Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS - // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) - return false; - - // In addition, tracing into BO requires that its surrounding s/zext (if - // any) is distributable to both operands. - // - // Suppose BO = A op B. - // SignExtended | ZeroExtended | Distributable? - // --------------+--------------+---------------------------------- - // 0 | 0 | true because no s/zext exists - // 0 | 1 | zext(BO) == zext(A) op zext(B) - // 1 | 0 | sext(BO) == sext(A) op sext(B) - // 1 | 1 | zext(sext(BO)) == - // | | zext(sext(A)) op zext(sext(B)) - if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) { - // If a + b >= 0 and (a >= 0 or b >= 0), then - // sext(a + b) = sext(a) + sext(b) - // even if the addition is not marked nsw. - // - // Leveraging this invarient, we can trace into an sext'ed inbound GEP - // index if the constant offset is non-negative. - // - // Verified in @sext_add in split-gep.ll. 
- if (ConstantInt *ConstLHS = dyn_cast(LHS)) { - if (!ConstLHS->isNegative()) - return true; - } - if (ConstantInt *ConstRHS = dyn_cast(RHS)) { - if (!ConstRHS->isNegative()) - return true; - } - } - - // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B) - // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B) - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Sub) { - if (SignExtended && !BO->hasNoSignedWrap()) - return false; - if (ZeroExtended && !BO->hasNoUnsignedWrap()) - return false; - } - - return true; -} - -APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, - bool SignExtended, - bool ZeroExtended) { - // BO being non-negative does not shed light on whether its operands are - // non-negative. Clear the NonNegative flag here. - APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If we found a constant offset in the left operand, stop and return that. - // This shortcut might cause us to miss opportunities of combining the - // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. - // However, such cases are probably already handled by -instcombine, - // given this pass runs after the standard optimizations. - if (ConstantOffset != 0) return ConstantOffset; - ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If U is a sub operator, negate the constant offset found in the right - // operand. - if (BO->getOpcode() == Instruction::Sub) - ConstantOffset = -ConstantOffset; - return ConstantOffset; -} - -APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, - bool ZeroExtended, bool NonNegative) { - // TODO(jingyue): We could trace into integer/pointer casts, such as - // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only - // integers because it gives good enough results for our benchmarks. - unsigned BitWidth = cast(V->getType())->getBitWidth(); - - // We cannot do much with Values that are not a User, such as an Argument. - User *U = dyn_cast(V); - if (U == nullptr) return APInt(BitWidth, 0); - - APInt ConstantOffset(BitWidth, 0); - if (ConstantInt *CI = dyn_cast(V)) { - // Hooray, we found it! - ConstantOffset = CI->getValue(); - } else if (BinaryOperator *BO = dyn_cast(V)) { - // Trace into subexpressions for more hoisting opportunities. - if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { - ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); - } - } else if (isa(V)) { - ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, - ZeroExtended, NonNegative).sext(BitWidth); - } else if (isa(V)) { - // As an optimization, we can clear the SignExtended flag because - // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. - // - // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. - ConstantOffset = - find(U->getOperand(0), /* SignExtended */ false, - /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); - } - - // If we found a non-zero constant offset, add it to the path for - // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't - // help this optimization. - if (ConstantOffset != 0) - UserChain.push_back(U); - return ConstantOffset; -} - -Value *ConstantOffsetExtractor::applyExts(Value *V) { - Value *Current = V; - // ExtInsts is built in the use-def order. Therefore, we apply them to V - // in the reversed order. 
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { - if (Constant *C = dyn_cast(Current)) { - // If Current is a constant, apply s/zext using ConstantExpr::getCast. - // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. - Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); - } else { - Instruction *Ext = (*I)->clone(); - Ext->setOperand(0, Current); - Ext->insertBefore(IP); - Current = Ext; - } - } - return Current; -} - -Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); - // Remove all nullptrs (used to be s/zext) from UserChain. - unsigned NewSize = 0; - for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { - if (*I != nullptr) { - UserChain[NewSize] = *I; - NewSize++; - } - } - UserChain.resize(NewSize); - return removeConstOffset(UserChain.size() - 1); -} - -Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { - User *U = UserChain[ChainIndex]; - if (ChainIndex == 0) { - assert(isa(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast(applyExts(U)); - } - - if (CastInst *Cast = dyn_cast(U)) { - assert((isa(Cast) || isa(Cast)) && - "We only traced into two types of CastInst: sext and zext"); - ExtInsts.push_back(Cast); - UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); - } - - // Function find only trace into BinaryOperator and CastInst. - BinaryOperator *BO = cast(U); - // OpNo = which operand of BO is UserChain[ChainIndex - 1] - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); - - BinaryOperator *NewBO = nullptr; - if (OpNo == 0) { - NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther, - BO->getName(), IP); - } else { - NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain, - BO->getName(), IP); - } - return UserChain[ChainIndex] = NewBO; -} - -Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { - if (ChainIndex == 0) { - assert(isa(UserChain[ChainIndex])); - return ConstantInt::getNullValue(UserChain[ChainIndex]->getType()); - } - - BinaryOperator *BO = cast(UserChain[ChainIndex]); - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); - Value *NextInChain = removeConstOffset(ChainIndex - 1); - Value *TheOther = BO->getOperand(1 - OpNo); - - // If NextInChain is 0 and not the LHS of a sub, we can simplify the - // sub-expression to be just TheOther. - if (ConstantInt *CI = dyn_cast(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) - return TheOther; - } - - if (BO->getOpcode() == Instruction::Or) { - // Rebuild "or" as "add", because "or" may be invalid for the new - // epxression. - // - // For instance, given - // a | (b + 5) where a and b + 5 have no common bits, - // we can extract 5 as the constant offset. - // - // However, reusing the "or" in the new index would give us - // (a | b) + 5 - // which does not equal a | (b + 5). 
- // - // Replacing the "or" with "add" is fine, because - // a | (b + 5) = a + (b + 5) = (a + b) + 5 - if (OpNo == 0) { - return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), - IP); - } else { - return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(), - IP); - } - } - - // We can reuse BO in this case, because the new expression shares the same - // instruction type and BO is used at most once. - assert(BO->getNumUses() <= 1 && - "distributeExtsAndCloneChain clones each BinaryOperator in " - "UserChain, so no one should be used more than " - "once"); - BO->setOperand(OpNo, NextInChain); - BO->setHasNoSignedWrap(false); - BO->setHasNoUnsignedWrap(false); - // Make sure it appears after all instructions we've inserted so far. - BO->moveBefore(IP); - return BO; -} - -Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(GEP); - // Find a non-zero constant offset first. - APInt ConstantOffset = - Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()); - if (ConstantOffset == 0) - return nullptr; - // Separates the constant offset from the GEP index. - return Extractor.rebuildWithoutConstOffset(); -} - -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { - // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. - return ConstantOffsetExtractor(GEP) - .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()) - .getSExtValue(); -} - -void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, - APInt &KnownZero) const { - IntegerType *IT = cast(V->getType()); - KnownOne = APInt(IT->getBitWidth(), 0); - KnownZero = APInt(IT->getBitWidth(), 0); - const DataLayout &DL = IP->getModule()->getDataLayout(); - llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); -} - -bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { - assert(LHS->getType() == RHS->getType() && - "LHS and RHS should have the same type"); - APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; - ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); - ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); -} - -bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( - GetElementPtrInst *GEP) { - bool Changed = false; - const DataLayout &DL = GEP->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); - gep_type_iterator GTI = gep_type_begin(*GEP); - for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); - I != E; ++I, ++GTI) { - // Skip struct member indices which must be i32. - if (isa(*GTI)) { - if ((*I)->getType() != IntPtrTy) { - *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); - Changed = true; - } - } - } - return Changed; -} - -int64_t -SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, - bool &NeedsExtraction) { - NeedsExtraction = false; - int64_t AccumulativeByteOffset = 0; - gep_type_iterator GTI = gep_type_begin(*GEP); - const DataLayout &DL = GEP->getModule()->getDataLayout(); - for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - // Tries to extract a constant offset from this GEP index. - int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); - if (ConstantOffset != 0) { - NeedsExtraction = true; - // A GEP may have multiple indices. 
We accumulate the extracted - // constant offset to a byte offset, and later offset the remainder of - // the original GEP with this byte offset. - AccumulativeByteOffset += - ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType()); - } - } else if (LowerGEP) { - StructType *StTy = cast(*GTI); - uint64_t Field = cast(GEP->getOperand(I))->getZExtValue(); - // Skip field 0 as the offset is always 0. - if (Field != 0) { - NeedsExtraction = true; - AccumulativeByteOffset += - DL.getStructLayout(StTy)->getElementOffset(Field); - } - } - } - return AccumulativeByteOffset; -} - -void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( - GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); - - Type *I8PtrTy = - Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); - Value *ResultPtr = Variadic->getOperand(0); - if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - - gep_type_iterator GTI = gep_type_begin(*Variadic); - // Create an ugly GEP for each sequential index. We don't create GEPs for - // structure indices, as they are accumulated in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); - // Skip zero indices. - if (ConstantInt *CI = dyn_cast(Idx)) - if (CI->isZero()) - continue; - - APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); - // Scale the index by element size. - if (ElementSize != 1) { - if (ElementSize.isPowerOf2()) { - Idx = Builder.CreateShl( - Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); - } else { - Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); - } - } - // Create an ugly GEP with a single index for each index. - ResultPtr = - Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); - } - } - - // Create a GEP with the constant offset index. - if (AccumulativeByteOffset != 0) { - Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); - ResultPtr = - Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } - if (ResultPtr->getType() != Variadic->getType()) - ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); - - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -void -SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); - - Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); - // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We - // don't create arithmetics for structure indices, as they are accumulated - // in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); - // Skip zero indices. - if (ConstantInt *CI = dyn_cast(Idx)) - if (CI->isZero()) - continue; - - APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); - // Scale the index by element size. 
- if (ElementSize != 1) { - if (ElementSize.isPowerOf2()) { - Idx = Builder.CreateShl( - Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); - } else { - Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); - } - } - // Create an ADD for each index. - ResultPtr = Builder.CreateAdd(ResultPtr, Idx); - } - } - - // Create an ADD for the constant offset index. - if (AccumulativeByteOffset != 0) { - ResultPtr = Builder.CreateAdd( - ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset)); - } - - ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType()); - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { - // Skip vector GEPs. - if (GEP->getType()->isVectorTy()) - return false; - - // The backend can already nicely handle the case where all indices are - // constant. - if (GEP->hasAllConstantIndices()) - return false; - - bool Changed = canonicalizeArrayIndicesToPointerSize(GEP); - - bool NeedsExtraction; - int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); - - if (!NeedsExtraction) - return Changed; - // If LowerGEP is disabled, before really splitting the GEP, check whether the - // backend supports the addressing mode we are about to produce. If no, this - // splitting probably won't be beneficial. - // If LowerGEP is enabled, even the extracted constant offset can not match - // the addressing mode, we can still do optimizations to other lowered parts - // of variable indices. Therefore, we don't check for addressing modes in that - // case. - if (!LowerGEP) { - TargetTransformInfo &TTI = - getAnalysis().getTTI( - *GEP->getParent()->getParent()); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), - /*BaseGV=*/nullptr, AccumulativeByteOffset, - /*HasBaseReg=*/true, /*Scale=*/0)) { - return Changed; - } - } - - // Remove the constant offset in each sequential index. The resultant GEP - // computes the variadic base. - // Notice that we don't remove struct field indices here. If LowerGEP is - // disabled, a structure index is not accumulated and we still use the old - // one. If LowerGEP is enabled, a structure index is accumulated in the - // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later - // handle the constant offset and won't need a new structure index. - gep_type_iterator GTI = gep_type_begin(*GEP); - for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - // Splits this GEP index into a variadic part and a constant offset, and - // uses the variadic part as the new index. - Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP); - if (NewIdx != nullptr) { - GEP->setOperand(I, NewIdx); - } - } - } - - // Clear the inbounds attribute because the new index may be off-bound. - // e.g., - // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b - // - // is transformed to: - // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 - // - // If a is -4, although the old index b is in bounds, the new index a is - // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the - // inbounds keyword is not present, the offsets are added to the base - // address with silently-wrapping two's complement arithmetic". - // Therefore, the final code will be a semantically equivalent. - // - // TODO(jingyue): do some range analysis to keep as many inbounds as - // possible. 
GEPs with inbounds are more friendly to alias analysis. - GEP->setIsInBounds(false); - - // Lowers a GEP to either GEPs with a single index or arithmetic operations. - if (LowerGEP) { - // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to - // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) - lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); - else - lowerToArithmetics(GEP, AccumulativeByteOffset); - return true; - } - - // No need to create another GEP if the accumulative byte offset is 0. - if (AccumulativeByteOffset == 0) - return true; - - // Offsets the base with the accumulative byte offset. - // - // %gep ; the base - // ... %gep ... - // - // => add the offset - // - // %gep2 ; clone of %gep - // %new.gep = gep %gep2, - // %gep ; will be removed - // ... %gep ... - // - // => replace all uses of %gep with %new.gep and remove %gep - // - // %gep2 ; clone of %gep - // %new.gep = gep %gep2, - // ... %new.gep ... - // - // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an - // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep): - // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the - // type of %gep. - // - // %gep2 ; clone of %gep - // %0 = bitcast %gep2 to i8* - // %uglygep = gep %0, - // %new.gep = bitcast %uglygep to - // ... %new.gep ... - Instruction *NewGEP = GEP->clone(); - NewGEP->insertBefore(GEP); - - // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned = - // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is - // used with unsigned integers later. - const DataLayout &DL = GEP->getModule()->getDataLayout(); - int64_t ElementTypeSizeOfGEP = static_cast( - DL.getTypeAllocSize(GEP->getType()->getElementType())); - Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); - if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { - // Very likely. As long as %gep is natually aligned, the byte offset we - // extracted should be a multiple of sizeof(*%gep). - int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; - NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, - ConstantInt::get(IntPtrTy, Index, true), - GEP->getName(), GEP); - } else { - // Unlikely but possible. For example, - // #pragma pack(1) - // struct S { - // int a[3]; - // int64 b[8]; - // }; - // #pragma pack() - // - // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After - // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is - // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of - // sizeof(int64). - // - // Emit an uglygep in this case. 
- Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), - GEP->getPointerAddressSpace()); - NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); - NewGEP = GetElementPtrInst::Create( - Type::getInt8Ty(GEP->getContext()), NewGEP, - ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", - GEP); - if (GEP->getType() != I8PtrTy) - NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); - } - - GEP->replaceAllUsesWith(NewGEP); - GEP->eraseFromParent(); - - return true; -} - -bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - if (DisableSeparateConstOffsetFromGEP) - return false; - - bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast(I++)) { - Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } - } - return Changed; -} Index: lib/Transforms/Scalar/StraightLineStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -11,7 +11,7 @@ // strength reduction, this algorithm is designed to reduce arithmetic // redundancy in straight-line code instead of loops. It has proven to be // effective in simplifying arithmetic statements derived from an unrolled loop. -// It can also simplify the logic of SeparateConstOffsetFromGEP. +// It can also simplify the logic of ReassociateGEPs. // // There are many optimizations we can perform in the domain of SLSR. This file // for now contains only an initial step. Specifically, we look for strength Index: test/CodeGen/AArch64/aarch64-gep-opt.ll =================================================================== --- test/CodeGen/AArch64/aarch64-gep-opt.ll +++ test/CodeGen/AArch64/aarch64-gep-opt.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linux-gnueabi" -; Following test cases test enabling SeparateConstOffsetFromGEP pass in AArch64 +; Following test cases test enabling ReassociateGEPs pass in AArch64 ; backend. If useAA() returns true, it will lower a GEP with multiple indices ; into GEPs with a single index, otherwise it will lower it into a ; "ptrtoint+arithmetics+inttoptr" form. @@ -150,7 +150,7 @@ ; CHECK-UseAA-LABEL: @test-struct_2( ; CHECK-UseAA: getelementptr i8, i8* %{{[a-zA-Z0-9]+}}, i64 -40 -; Test that when a index is added from two constant, SeparateConstOffsetFromGEP +; Test that when a index is added from two constant, ReassociateGEPs ; pass does not generate incorrect result. define void @test_const_add([3 x i32]* %in) { %inc = add nsw i32 2, 1 Index: test/CodeGen/PowerPC/ppc64-gep-opt.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-gep-opt.ll +++ test/CodeGen/PowerPC/ppc64-gep-opt.ll @@ -4,7 +4,7 @@ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" -; Following test cases test enabling SeparateConstOffsetFromGEP pass in the PPC +; Following test cases test enabling ReassociateGEPs pass in the PPC ; backend. If useAA() returns true, it will lower a GEP with multiple indices ; into GEPs with a single index, otherwise it will lower it into a ; "ptrtoint+arithmetics+inttoptr" form. 
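; A hedged sketch (struct layout and names hypothetical, not taken from this
; file) of the struct-field handling these tests rely on: when lowering, the
; field offset is accumulated into the single constant index of an i8 GEP.
;
;   %p = getelementptr inbounds %struct.S, %struct.S* %s, i64 %i, i32 2
;
; with useAA() becomes
;
;   %0 = bitcast %struct.S* %s to i8*
;   %1 = getelementptr i8, i8* %0, i64 %i.scaled   ; %i * sizeof(%struct.S)
;   %2 = getelementptr i8, i8* %1, i64 8           ; assumed 8-byte field offset
;   %p = bitcast i8* %2 to i32*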
@@ -141,7 +141,7 @@ ; CHECK-UseAA-LABEL: @test-struct_2( ; CHECK-UseAA: getelementptr i8, i8* %{{[a-zA-Z0-9]+}}, i64 -40 -; Test that when a index is added from two constant, SeparateConstOffsetFromGEP +; Test that when a index is added from two constant, ReassociateGEPs ; pass does not generate incorrect result. define void @test_const_add([3 x i32]* %in) { %inc = add nsw i32 2, 1 Index: test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll @@ -1,11 +1,11 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX -; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR +; RUN: opt < %s -S -reassociate-geps -gvn -dce | FileCheck %s --check-prefix=IR -; Verifies the SeparateConstOffsetFromGEP pass. +; Verifies the ReassociateGEPs pass. ; The following code computes ; *output = array[x][y] + array[x][y+1] + array[x+1][y] + array[x+1][y+1] ; -; We expect SeparateConstOffsetFromGEP to transform it to +; We expect ReassociateGEPs to transform it to ; ; float *base = &a[x][y]; ; *output = base[0] + base[1] + base[32] + base[33]; @@ -60,7 +60,7 @@ ; the order of "sext" and "add" when computing the array indices. @sum_of_array ; computes add before sext, e.g., array[sext(x + 1)][sext(y + 1)], while ; @sum_of_array2 computes sext before add, -; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be +; e.g., array[sext(x) + 1][sext(y) + 1]. ReassociateGEPs should be ; able to extract constant offsets from both forms. define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) { .preheader: Index: test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll @@ -1,6 +1,6 @@ -; RUN: opt < %s -separate-const-offset-from-gep -dce -S | FileCheck %s +; RUN: opt < %s -reassociate-geps -dce -S | FileCheck %s -; Several unit tests for -separate-const-offset-from-gep. The transformation +; Several unit tests for -reassociate-geps. The transformation ; heavily relies on TargetTransformInfo, so we put these tests under ; target-specific folders. @@ -245,7 +245,7 @@ %or = or i64 %add, 1 ; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1 ; ((a << 2) + 12) and 1 have no common bits. Therefore, - ; SeparateConstOffsetFromGEP is able to extract the 12. + ; ReassociateGEPs is able to extract the 12. ; TODO(jingyue): We could reassociate the expression to combine 12 and 1. %p = getelementptr float, float* %ptr, i64 %or ; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float, float* %ptr, i64 [[OR]] Index: test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll @@ -0,0 +1,161 @@ +; RUN: opt < %s -reassociate-geps -dce -licm -S | FileCheck %s + +; Several unit tests for -reassociate-geps focusing on splitting GEPs for +; LICM. The transformation heavily relies on TargetTransformInfo, so we put +; these tests under target-specific folders. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +; target triple is necessary; otherwise TargetTransformInfo rejects any +; addressing mode. 
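; As a sketch of the sign-extension handling exercised by @simple_licm below
; (mirroring its CHECK lines): because the index addition is nsw, sext can be
; distributed over it, so the loop-invariant half gets its own GEP that -licm
; can hoist.
;
;   %idx      = add nsw i32 %a, %i
;   %idx.sext = sext i32 %idx to i64
;   %p        = getelementptr i32, i32* %input, i64 %idx.sext
;
; becomes
;
;   %a.sext = sext i32 %a to i64
;   %invgep = getelementptr i32, i32* %input, i64 %a.sext   ; hoisted out of the loop
;   ...
;   %i.sext = sext i32 %i to i64
;   %p      = getelementptr i32, i32* %invgep, i64 %i.sext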
+target triple = "nvptx64-unknown-unknown" + +define void @simple_licm(i32* %input, i32* %output, i32 %a) { +; CHECK-LABEL: @simple_licm +entry: +; CHECK: entry: +; CHECK-NEXT: [[SEXT1:%[a-zA-Z0-9]+]] = sext i32 %a to i64 +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr i32, i32* %input, i64 [[SEXT1]] + br label %loop + +loop: +; CHECK: [[SEXT2:%[a-zA-Z0-9]+]] = sext i32 %i to i64 +; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP1]], i64 [[SEXT2]] + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i32 %a, %i + %idx.sext = sext i32 %idx to i64 + %arrayidx = getelementptr i32, i32* %input, i64 %idx.sext + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP index with both a loop-invariant subexpression and a constant. +; Constant should be hoisted first followed by the loop-invariant +; reassociation and splitting. +define void @gep_with_const(i32* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @gep_with_const +entry: +; CHECK: entry: +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr i32, i32* %input, i64 %a + br label %loop + +loop: +; CHECK: loop: +; CHECK: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP1]], i64 %i +; CHECK: %{{[a-zA-Z0-9]+}} = getelementptr i32, i32* [[GEP2]], i64 1234 + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %sum = add nsw i64 %a, 1234 + %idx = add nsw i64 %sum, %i + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP with an already invariant index should not be split. +define void @already_invariant_gep(i32* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @already_invariant_gep( +entry: +; CHECK: getelementptr i32, i32* %input, i64 %idx + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i64 %a, %a + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP with an index which is not loop invariant should not be split. +define void @no_loop_invariance(i32* %input, i32* %output) { +; CHECK-LABEL: @no_loop_invariance( +entry: + br label %loop + +loop: +; CHECK: loop: +; CHECK: getelementptr i32, i32* %input, i64 %idx + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i64 %i, %i + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; Doubly nested loop containing GEP instruction with multiple indices. GEP can +; be split at two of the indices forming two new loop-invariant geps which are +; hoisted to different loop-depths. 
+define void @double_loop([5 x [6 x i32]]* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @double_loop( +entry: +; CHECK: entry: +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr [5 x [6 x i32]], [5 x [6 x i32]]* %input, i64 %a, i64 %a + br label %loop1 + +loop1: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop2.exit ] +; CHECK: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr [6 x i32], [6 x i32]* [[GEP1]], i64 %i, i64 %i +; CHECK-NEXT: br label %loop2 + br label %loop2 + +loop2: + %j = phi i64 [ 0, %loop1 ], [ %j.next, %loop2 ] +; CHECK: %j = phi +; CHECK-NEXT: [[GEP3:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP2]], i64 %j +; CHECK-NEXT: %{{[a-zA-Z0-9]+}} = load i32, i32* [[GEP3]] + + %idx1 = add nsw i64 %a, %i + %idx2 = add nsw i64 %i, %j + %arrayidx = getelementptr [5 x [6 x i32]], [5 x [6 x i32]]* %input, i64 %a, i64 %idx1, i64 %idx2 + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %j.next = add nuw nsw i64 %j, 1 + %exitcond2 = icmp eq i64 %j.next, 1000000 + br i1 %exitcond2, label %loop2.exit, label %loop2 + +loop2.exit: + %i.next = add nuw nsw i64 %i, 1 + %exitcond1 = icmp eq i64 %i.next, 1000000 + br i1 %exitcond1, label %loop1.exit, label %loop1 + +loop1.exit: + ret void +} Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg @@ -1,3 +0,0 @@ -if not 'NVPTX' in config.root.targets: - config.unsupported = True - Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -1,196 +0,0 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX -; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR - -; Verifies the SeparateConstOffsetFromGEP pass. -; The following code computes -; *output = array[x][y] + array[x][y+1] + array[x+1][y] + array[x+1][y+1] -; -; We expect SeparateConstOffsetFromGEP to transform it to -; -; float *base = &a[x][y]; -; *output = base[0] + base[1] + base[32] + base[33]; -; -; so the backend can emit PTX that uses fewer virtual registers. 
- -target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-unknown-unknown" - -@array = internal addrspace(3) constant [32 x [32 x float]] zeroinitializer, align 4 - -define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = sext i32 %y to i64 - %1 = sext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i32 %y, 1 - %7 = sext i32 %6 to i64 - %8 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7 - %9 = addrspacecast float addrspace(3)* %8 to float* - %10 = load float, float* %9, align 4 - %11 = fadd float %5, %10 - %12 = add i32 %x, 1 - %13 = sext i32 %12 to i64 - %14 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0 - %15 = addrspacecast float addrspace(3)* %14 to float* - %16 = load float, float* %15, align 4 - %17 = fadd float %11, %16 - %18 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7 - %19 = addrspacecast float addrspace(3)* %18 to float* - %20 = load float, float* %19, align 4 - %21 = fadd float %17, %20 - store float %21, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - -; @sum_of_array2 is very similar to @sum_of_array. The only difference is in -; the order of "sext" and "add" when computing the array indices. @sum_of_array -; computes add before sext, e.g., array[sext(x + 1)][sext(y + 1)], while -; @sum_of_array2 computes sext before add, -; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be -; able to extract constant offsets from both forms. 
-define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = sext i32 %y to i64 - %1 = sext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i64 %0, 1 - %7 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6 - %8 = addrspacecast float addrspace(3)* %7 to float* - %9 = load float, float* %8, align 4 - %10 = fadd float %5, %9 - %11 = add i64 %1, 1 - %12 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0 - %13 = addrspacecast float addrspace(3)* %12 to float* - %14 = load float, float* %13, align 4 - %15 = fadd float %10, %14 - %16 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6 - %17 = addrspacecast float addrspace(3)* %16 to float* - %18 = load float, float* %17, align 4 - %19 = fadd float %15, %18 - store float %19, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array2( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array2( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - - -; This function loads -; array[zext(x)][zext(y)] -; array[zext(x)][zext(y +nuw 1)] -; array[zext(x +nuw 1)][zext(y)] -; array[zext(x +nuw 1)][zext(y +nuw 1)]. -; -; This function is similar to @sum_of_array, but it -; 1) extends array indices using zext instead of sext; -; 2) annotates the addition with "nuw"; otherwise, zext(x + 1) => zext(x) + 1 -; may be invalid. 
-define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = zext i32 %y to i64 - %1 = zext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add nuw i32 %y, 1 - %7 = zext i32 %6 to i64 - %8 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7 - %9 = addrspacecast float addrspace(3)* %8 to float* - %10 = load float, float* %9, align 4 - %11 = fadd float %5, %10 - %12 = add nuw i32 %x, 1 - %13 = zext i32 %12 to i64 - %14 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0 - %15 = addrspacecast float addrspace(3)* %14 to float* - %16 = load float, float* %15, align 4 - %17 = fadd float %11, %16 - %18 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7 - %19 = addrspacecast float addrspace(3)* %18 to float* - %20 = load float, float* %19, align 4 - %21 = fadd float %17, %20 - store float %21, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array3( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array3( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - - -; This function loads -; array[zext(x)][zext(y)] -; array[zext(x)][zext(y)] -; array[zext(x) + 1][zext(y) + 1] -; array[zext(x) + 1][zext(y) + 1]. -; -; We expect the generated code to reuse the computation of -; &array[zext(x)][zext(y)]. See the expected IR and PTX for details. 
-define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = zext i32 %y to i64 - %1 = zext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i64 %0, 1 - %7 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6 - %8 = addrspacecast float addrspace(3)* %7 to float* - %9 = load float, float* %8, align 4 - %10 = fadd float %5, %9 - %11 = add i64 %1, 1 - %12 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0 - %13 = addrspacecast float addrspace(3)* %12 to float* - %14 = load float, float* %13, align 4 - %15 = fadd float %10, %14 - %16 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6 - %17 = addrspacecast float addrspace(3)* %16 to float* - %18 = load float, float* %17, align 4 - %19 = fadd float %15, %18 - store float %19, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array4( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array4( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -1,279 +0,0 @@ -; RUN: opt < %s -separate-const-offset-from-gep -dce -S | FileCheck %s - -; Several unit tests for -separate-const-offset-from-gep. The transformation -; heavily relies on TargetTransformInfo, so we put these tests under -; target-specific folders. - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; target triple is necessary; otherwise TargetTransformInfo rejects any -; addressing mode. -target triple = "nvptx64-unknown-unknown" - -%struct.S = type { float, double } - -@struct_array = global [1024 x %struct.S] zeroinitializer, align 16 -@float_2d_array = global [32 x [32 x float]] zeroinitializer, align 4 - -; We should not extract any struct field indices, because fields in a struct -; may have different types. -define double* @struct(i32 %i) { -entry: - %add = add nsw i32 %i, 5 - %idxprom = sext i32 %add to i64 - %p = getelementptr inbounds [1024 x %struct.S], [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1 - ret double* %p -} -; CHECK-LABEL: @struct( -; CHECK: getelementptr [1024 x %struct.S], [1024 x %struct.S]* @struct_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1 - -; We should be able to trace into sext(a + b) if a + b is non-negative -; (e.g., used as an index of an inbounds GEP) and one of a and b is -; non-negative. 
-define float* @sext_add(i32 %i, i32 %j) { -entry: - %0 = add i32 %i, 1 - %1 = sext i32 %0 to i64 ; inbound sext(i + 1) = sext(i) + 1 - %2 = add i32 %j, -2 - ; However, inbound sext(j + -2) != sext(j) + -2, e.g., j = INT_MIN - %3 = sext i32 %2 to i64 - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3 - ret float* %p -} -; CHECK-LABEL: @sext_add( -; CHECK-NOT: = add -; CHECK: add i32 %j, -2 -; CHECK: sext -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* %{{[a-zA-Z0-9]+}}, i64 32 - -; We should be able to trace into sext/zext if it can be distributed to both -; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b) -; -; This test verifies we can transform -; gep base, a + sext(b +nsw 1), c + zext(d +nuw 1) -; to -; gep base, a + sext(b), c + zext(d); gep ..., 1 * 32 + 1 -define float* @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) { - %b1 = add nsw i32 %b, 1 - %b2 = sext i32 %b1 to i64 - %i = add i64 %a, %b2 ; i = a + sext(b +nsw 1) - %d1 = add nuw i32 %d, 1 - %d2 = zext i32 %d1 to i64 - %j = add i64 %c, %d2 ; j = c + zext(d +nuw 1) - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j - ret float* %p -} -; CHECK-LABEL: @ext_add_no_overflow( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 33 - -; Verifies we handle nested sext/zext correctly. -define void @sext_zext(i32 %a, i32 %b, float** %out1, float** %out2) { -entry: - %0 = add nsw nuw i32 %a, 1 - %1 = sext i32 %0 to i48 - %2 = zext i48 %1 to i64 ; zext(sext(a +nsw nuw 1)) = zext(sext(a)) + 1 - %3 = add nsw i32 %b, 2 - %4 = sext i32 %3 to i48 - %5 = zext i48 %4 to i64 ; zext(sext(b +nsw 2)) != zext(sext(b)) + 2 - %p1 = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %2, i64 %5 - store float* %p1, float** %out1 - %6 = add nuw i32 %a, 3 - %7 = zext i32 %6 to i48 - %8 = sext i48 %7 to i64 ; sext(zext(a +nuw 3)) = zext(a +nuw 3) = zext(a) + 3 - %9 = add nsw i32 %b, 4 - %10 = zext i32 %9 to i48 - %11 = sext i48 %10 to i64 ; sext(zext(b +nsw 4)) != zext(b) + 4 - %p2 = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %8, i64 %11 - store float* %p2, float** %out2 - ret void -} -; CHECK-LABEL: @sext_zext( -; CHECK: [[BASE_PTR_1:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR_1]], i64 32 -; CHECK: [[BASE_PTR_2:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR_2]], i64 96 - -; Similar to @ext_add_no_overflow, we should be able to trace into s/zext if -; its operand is an OR and the two operands of the OR have no common bits. 
-define float* @sext_or(i64 %a, i32 %b) { -entry: - %b1 = shl i32 %b, 2 - %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits - %b3 = or i32 %b1, 4 ; (b << 2) and 4 may have common bits - %b2.ext = zext i32 %b2 to i64 - %b3.ext = sext i32 %b3 to i64 - %i = add i64 %a, %b2.ext - %j = add i64 %a, %b3.ext - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j - ret float* %p -} -; CHECK-LABEL: @sext_or( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 32 - -; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b + -; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't -; affected. -define float* @expr(i64 %a, i64 %b, i64* %out) { -entry: - %b5 = add i64 %b, 5 - %i = add i64 %b5, %a - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 0 - store i64 %b5, i64* %out - ret float* %p -} -; CHECK-LABEL: @expr( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 0 -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 160 -; CHECK: store i64 %b5, i64* %out - -; d + sext(a +nsw (b +nsw (c +nsw 8))) => (d + sext(a) + sext(b) + sext(c)) + 8 -define float* @sext_expr(i32 %a, i32 %b, i32 %c, i64 %d) { -entry: - %0 = add nsw i32 %c, 8 - %1 = add nsw i32 %b, %0 - %2 = add nsw i32 %a, %1 - %3 = sext i32 %2 to i64 - %i = add i64 %d, %3 - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - ret float* %p -} -; CHECK-LABEL: @sext_expr( -; CHECK: sext i32 -; CHECK: sext i32 -; CHECK: sext i32 -; CHECK: getelementptr float, float* %{{[a-zA-Z0-9]+}}, i64 8 - -; Verifies we handle "sub" correctly. -define float* @sub(i64 %i, i64 %j) { - %i2 = sub i64 %i, 5 ; i - 5 - %j2 = sub i64 5, %j ; 5 - i - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i2, i64 %j2 - ret float* %p -} -; CHECK-LABEL: @sub( -; CHECK: %[[j2:[a-zA-Z0-9]+]] = sub i64 0, %j -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]] -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 -155 - -%struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed - -; Verifies we can emit correct uglygep if the address is not natually aligned. 
-define i64* @packed_struct(i32 %i, i32 %j) { -entry: - %s = alloca [1024 x %struct.Packed], align 16 - %add = add nsw i32 %j, 3 - %idxprom = sext i32 %add to i64 - %add1 = add nsw i32 %i, 1 - %idxprom2 = sext i32 %add1 to i64 - %arrayidx3 = getelementptr inbounds [1024 x %struct.Packed], [1024 x %struct.Packed]* %s, i64 0, i64 %idxprom2, i32 1, i64 %idxprom - ret i64* %arrayidx3 -} -; CHECK-LABEL: @packed_struct( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [1024 x %struct.Packed], [1024 x %struct.Packed]* %s, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1, i64 %{{[a-zA-Z0-9]+}} -; CHECK: [[CASTED_PTR:%[a-zA-Z0-9]+]] = bitcast i64* [[BASE_PTR]] to i8* -; CHECK: %uglygep = getelementptr i8, i8* [[CASTED_PTR]], i64 100 -; CHECK: bitcast i8* %uglygep to i64* - -; We shouldn't be able to extract the 8 from "zext(a +nuw (b + 8))", -; because "zext(b + 8) != zext(b) + 8" -define float* @zext_expr(i32 %a, i32 %b) { -entry: - %0 = add i32 %b, 8 - %1 = add nuw i32 %a, %0 - %i = zext i32 %1 to i64 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - ret float* %p -} -; CHECK-LABEL: zext_expr( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - -; Per http://llvm.org/docs/LangRef.html#id181, the indices of a off-bound gep -; should be considered sign-extended to the pointer size. Therefore, -; gep base, (add i32 a, b) != gep (gep base, i32 a), i32 b -; because -; sext(a + b) != sext(a) + sext(b) -; -; This test verifies we do not illegitimately extract the 8 from -; gep base, (i32 a + 8) -define float* @i32_add(i32 %a) { -entry: - %i = add i32 %a, 8 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i32 %i - ret float* %p -} -; CHECK-LABEL: @i32_add( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}} -; CHECK-NOT: getelementptr - -; Verifies that we compute the correct constant offset when the index is -; sign-extended and then zero-extended. The old version of our code failed to -; handle this case because it simply computed the constant offset as the -; sign-extended value of the constant part of the GEP index. -define float* @apint(i1 %a) { -entry: - %0 = add nsw nuw i1 %a, 1 - %1 = sext i1 %0 to i4 - %2 = zext i4 %1 to i64 ; zext (sext i1 1 to i4) to i64 = 15 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %2 - ret float* %p -} -; CHECK-LABEL: @apint( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 15 - -; Do not trace into binary operators other than ADD, SUB, and OR. -define float* @and(i64 %a) { -entry: - %0 = shl i64 %a, 2 - %1 = and i64 %0, 1 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %1 - ret float* %p -} -; CHECK-LABEL: @and( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array -; CHECK-NOT: getelementptr - -; The code that rebuilds an OR expression used to be buggy, and failed on this -; test. -define float* @shl_add_or(i64 %a, float* %ptr) { -; CHECK-LABEL: @shl_add_or( -entry: - %shl = shl i64 %a, 2 - %add = add i64 %shl, 12 - %or = or i64 %add, 1 -; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1 - ; ((a << 2) + 12) and 1 have no common bits. 
Therefore, - ; SeparateConstOffsetFromGEP is able to extract the 12. - ; TODO(jingyue): We could reassociate the expression to combine 12 and 1. - %p = getelementptr float, float* %ptr, i64 %or -; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float, float* %ptr, i64 [[OR]] -; CHECK: getelementptr float, float* [[PTR]], i64 12 - ret float* %p -; CHECK-NEXT: ret -} - -; The source code used to be buggy in checking -; (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) -; where AccumulativeByteOffset is signed but ElementTypeSizeOfGEP is unsigned. -; The compiler would promote AccumulativeByteOffset to unsigned, causing -; unexpected results. For example, while -64 % (int64_t)24 != 0, -; -64 % (uint64_t)24 == 0. -%struct3 = type { i64, i32 } -%struct2 = type { %struct3, i32 } -%struct1 = type { i64, %struct2 } -%struct0 = type { i32, i32, i64*, [100 x %struct1] } -define %struct2* @sign_mod_unsign(%struct0* %ptr, i64 %idx) { -; CHECK-LABEL: @sign_mod_unsign( -entry: - %arrayidx = add nsw i64 %idx, -2 -; CHECK-NOT: add - %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1 -; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1 -; CHECK: [[PTR1:%[a-zA-Z0-9]+]] = bitcast %struct2* [[PTR]] to i8* -; CHECK: getelementptr i8, i8* [[PTR1]], i64 -64 -; CHECK: bitcast - ret %struct2* %ptr2 -; CHECK-NEXT: ret -}