Index: lib/Transforms/Scalar/NaryReassociate.cpp =================================================================== --- lib/Transforms/Scalar/NaryReassociate.cpp +++ lib/Transforms/Scalar/NaryReassociate.cpp @@ -85,6 +85,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" @@ -104,6 +105,10 @@ initializeNaryReassociatePass(*PassRegistry::getPassRegistry()); } + bool doInitialization(Module &M) override { + DL = &M.getDataLayout(); + return false; + } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -113,6 +118,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } @@ -120,16 +126,49 @@ // Runs only one iteration of the dominator-based algorithm. See the header // comments for why we need multiple iterations. bool doOneIteration(Function &F); - // Reasssociates I to a better form. - Instruction *tryReassociateAdd(Instruction *I); + + // Reassociates I for better CSE. + Instruction *tryReassociate(Instruction *I); + + // Reassociate GEP for better CSE. + Instruction *tryReassociateGEP(GetElementPtrInst *GEP); + // Try splitting GEP at the I-th index and see whether either part can be + // CSE'ed. This is a helper function for tryReassociateGEP. + // + // \p IndexedType The element type indexed by GEP's I-th index. This is + // equivalent to + // GEP->getIndexedType(GEP->getPointerOperand(), 0-th index, + // ..., i-th index). + GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP, + unsigned I, Type *IndexedType); + // Given GEP's I-th index = LHS + RHS, see whether &Base[..][LHS][..] or + // &Base[..][RHS][..] can be CSE'ed and rewrite GEP accordingly. + GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP, + unsigned I, Value *LHS, + Value *RHS, Type *IndexedType); + + // Reassociate Add for better CSE. + Instruction *tryReassociateAdd(BinaryOperator *I); // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); // Rewrites I to LHS + RHS if LHS is computed already. Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + // Returns the closest dominator of \c Dominatee that computes + // \c CandidateExpr. Returns null if not found. + Instruction *findClosestMatchingDominator(const SCEV *CandidateExpr, + Instruction *Dominatee); + // GetElementPtrInst implicitly sign-extends an index if the index is shorter + // than the pointer size. This function returns whether Index is shorter than + // GEP's pointer size, i.e., whether Index needs to be sign-extended in order + // to be an index of GEP. + bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); + DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; + TargetTransformInfo *TTI; + const DataLayout *DL; // A lookup table quickly telling which instructions compute the given SCEV. // Note that there can be multiple instructions at different locations // computing to the same SCEV, so we map a SCEV to an instruction list. For @@ -149,6 +188,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", false, false) @@ -163,6 +203,7 @@ DT = &getAnalysis().getDomTree(); SE = &getAnalysis(); TLI = &getAnalysis().getTLI(); + TTI = &getAnalysis().getTTI(F); bool Changed = false, ChangedInThisIteration; do { @@ -172,26 +213,36 @@ return Changed; } +// Whitelist the instruction types NaryReassociate handles for now. +static bool isPotentiallyNaryReassociable(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + bool NaryReassociate::doOneIteration(Function &F) { bool Changed = false; SeenExprs.clear(); - // Traverse the dominator tree in the depth-first order. This order makes sure - // all bases of a candidate are in Candidates when we process it. + // Process the basic blocks in pre-order of the dominator tree. This order + // ensures that all bases of a candidate are in Candidates when we process it. for (auto Node = GraphTraits::nodes_begin(DT); Node != GraphTraits::nodes_end(DT); ++Node) { BasicBlock *BB = Node->getBlock(); for (auto I = BB->begin(); I != BB->end(); ++I) { - // Skip vector types which are not SCEVable. - if (I->getOpcode() == Instruction::Add && !I->getType()->isVectorTy()) { - if (Instruction *NewI = tryReassociateAdd(I)) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { + if (Instruction *NewI = tryReassociate(I)) { Changed = true; SE->forgetValue(I); I->replaceAllUsesWith(NewI); RecursivelyDeleteTriviallyDeadInstructions(I, TLI); I = NewI; } - // We should add the rewritten instruction because tryReassociateAdd may - // have invalidated the original one. + // Add the rewritten instruction to SeenExprs; the original instruction + // is deleted. SeenExprs[SE->getSCEV(I)].push_back(I); } } @@ -199,7 +250,168 @@ return Changed; } -Instruction *NaryReassociate::tryReassociateAdd(Instruction *I) { +Instruction *NaryReassociate::tryReassociate(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + return tryReassociateAdd(cast(I)); + case Instruction::GetElementPtr: + return tryReassociateGEP(cast(I)); + default: + llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable"); + } +} + +// FIXME: extract this method into TTI->getGEPCost. +static bool isGEPFoldable(GetElementPtrInst *GEP, + const TargetTransformInfo *TTI, + const DataLayout *DL) { + GlobalVariable *BaseGV = nullptr; + int64_t BaseOffset = 0; + bool HasBaseReg = false; + int64_t Scale = 0; + + if (GlobalVariable *GV = dyn_cast(GEP->getPointerOperand())) + BaseGV = GV; + else + HasBaseReg = true; + + gep_type_iterator GTI = gep_type_begin(GEP); + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) { + if (isa(*GTI)) { + int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *ConstIdx = dyn_cast(*I)) { + BaseOffset += ConstIdx->getSExtValue() * ElementSize; + } else { + // Needs scale register. + if (Scale != 0) { + // No addressing mode takes two scale registers. + return false; + } + Scale = ElementSize; + } + } else { + StructType *STy = cast(*GTI); + uint64_t Field = cast(*I)->getZExtValue(); + BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field); + } + } + return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, + BaseOffset, HasBaseReg, Scale); +} + +Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) { + // Not worth reassociating GEP if it is foldable. + if (isGEPFoldable(GEP, TTI, DL)) + return nullptr; + + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) { + if (isa(*GTI++)) { + if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1, *GTI)) { + return NewGEP; + } + } + } + return nullptr; +} + +bool NaryReassociate::requiresSignExtension(Value *Index, + GetElementPtrInst *GEP) { + unsigned PointerSizeInBits = + DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace()); + return cast(Index->getType())->getBitWidth() < PointerSizeInBits; +} + +GetElementPtrInst * +NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, + Type *IndexedType) { + Value *IndexToSplit = GEP->getOperand(I + 1); + if (SExtInst *SExt = dyn_cast(IndexToSplit)) + IndexToSplit = SExt->getOperand(0); + + if (AddOperator *AO = dyn_cast(IndexToSplit)) { + // If the I-th index needs sext and the underlying add is not equipped with + // nsw, we cannot split the add because + // sext(LHS + RHS) != sext(LHS) + sext(RHS). + if (requiresSignExtension(IndexToSplit, GEP) && !AO->hasNoSignedWrap()) + return nullptr; + Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); + // IndexToSplit = LHS + RHS. + if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) + return NewGEP; + // Symmetrically, try IndexToSplit = RHS + LHS. + if (LHS != RHS) { + if (auto *NewGEP = + tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType)) + return NewGEP; + } + } + return nullptr; +} + +GetElementPtrInst * +NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, + Value *LHS, Value *RHS, + Type *IndexedType) { + // Look for GEP's closest dominator that has the same SCEV as GEP except that + // the I-th index is replaced with LHS. + SmallVector IndexExprs; + for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index) + IndexExprs.push_back(SE->getSCEV(*Index)); + // Replace the I-th index with LHS. + IndexExprs[I] = SE->getSCEV(LHS); + const SCEV *CandidateExpr = SE->getGEPExpr( + GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()), + IndexExprs, GEP->isInBounds()); + + auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); + if (Candidate == nullptr) + return nullptr; + + PointerType *TypeOfCandidate = dyn_cast(Candidate->getType()); + // Pretty rare but theoretically possible when a numeric value happens to + // share CandidateExpr. + if (TypeOfCandidate == nullptr) + return nullptr; + + // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) + uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); + Type *ElementType = TypeOfCandidate->getElementType(); + uint64_t ElementSize = DL->getTypeAllocSize(ElementType); + // Another less rare case: because I is not necessarily the last index of the + // GEP, the size of the type at the I-th index (IndexedSize) is not + // necessarily divisible by ElementSize. For example, + // + // #pragma pack(1) + // struct S { + // int a[3]; + // int64 b[8]; + // }; + // #pragma pack() + // + // sizeof(S) = 100 is indivisible by sizeof(int64) = 8. + // + // TODO: bail out on this case for now. We could emit uglygep. + if (IndexedSize % ElementSize != 0) + return nullptr; + + // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); + IRBuilder<> Builder(GEP); + Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); + if (RHS->getType() != IntPtrTy) + RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy); + if (IndexedSize != ElementSize) { + RHS = Builder.CreateMul( + RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize)); + } + GetElementPtrInst *NewGEP = + cast(Builder.CreateGEP(Candidate, RHS)); + NewGEP->setIsInBounds(GEP->isInBounds()); + NewGEP->takeName(GEP); + return NewGEP; +} + +Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) return NewI; @@ -236,22 +448,34 @@ if (Pos == SeenExprs.end()) return nullptr; - auto &LHSCandidates = Pos->second; // Look for the closest dominator LHS of I that computes LHSExpr, and replace // I with LHS + RHS. - // - // Because we traverse the dominator tree in the pre-order, a + auto *LHS = findClosestMatchingDominator(LHSExpr, I); + if (LHS == nullptr) + return nullptr; + + Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + NewI->takeName(I); + return NewI; +} + +Instruction * +NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, + Instruction *Dominatee) { + auto Pos = SeenExprs.find(CandidateExpr); + if (Pos == SeenExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a // candidate that doesn't dominate the current instruction won't dominate any // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). - while (!LHSCandidates.empty()) { - Instruction *LHS = LHSCandidates.back(); - if (DT->dominates(LHS, I)) { - Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); - NewI->takeName(I); - return NewI; - } - LHSCandidates.pop_back(); + while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); } return nullptr; } Index: test/Transforms/NaryReassociate/NVPTX/nary-gep.ll =================================================================== --- /dev/null +++ test/Transforms/NaryReassociate/NVPTX/nary-gep.ll @@ -0,0 +1,77 @@ +; RUN: opt < %s -nary-reassociate -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-unknown-unknown" + +declare void @foo(float*) + +; foo(&a[i]); +; foo(&a[i + j]); +; => +; t = &a[i]; +; foo(t); +; foo(t + j); +define void @reassociate_gep(float* %a, i64 %i, i64 %j) { +; CHECK-LABEL: @reassociate_gep( + %1 = add i64 %i, %j + %2 = getelementptr float, float* %a, i64 %i +; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i64 %i + call void @foo(float* %2) +; CHECK: call void @foo(float* [[t1]]) + %3 = getelementptr float, float* %a, i64 %1 +; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 %j + call void @foo(float* %3) +; CHECK: call void @foo(float* [[t2]]) + ret void +} + +; foo(&a[sext(j)]); +; foo(&a[sext(i +nsw j)]); +; => +; t = &a[sext(j)]; +; foo(t); +; foo(t + sext(i)); +define void @reassociate_gep_nsw(float* %a, i32 %i, i32 %j) { +; CHECK-LABEL: @reassociate_gep_nsw( + %1 = add nsw i32 %i, %j + %idxprom.1 = sext i32 %1 to i64 + %idxprom.j = sext i32 %j to i64 + %2 = getelementptr float, float* %a, i64 %idxprom.j +; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i64 %idxprom.j + call void @foo(float* %2) +; CHECK: call void @foo(float* [[t1]]) + %3 = getelementptr float, float* %a, i64 %idxprom.1 +; CHECK: [[sexti:[^ ]+]] = sext i32 %i to i64 +; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 [[sexti]] + call void @foo(float* %3) +; CHECK: call void @foo(float* [[t2]]) + ret void +} + +; Do not split the second GEP because sext(i + j) != sext(i) + sext(j). +define void @reassociate_gep_no_nsw(float* %a, i32 %i, i32 %j) { +; CHECK-LABEL: @reassociate_gep_no_nsw( + %1 = add i32 %i, %j + %2 = getelementptr float, float* %a, i32 %j +; CHECK: getelementptr float, float* %a, i32 %j + call void @foo(float* %2) + %3 = getelementptr float, float* %a, i32 %1 +; CHECK: getelementptr float, float* %a, i32 %1 + call void @foo(float* %3) + ret void +} + +define void @reassociate_gep_128(float* %a, i128 %i, i128 %j) { +; CHECK-LABEL: @reassociate_gep_128( + %1 = add i128 %i, %j + %2 = getelementptr float, float* %a, i128 %i +; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i128 %i + call void @foo(float* %2) +; CHECK: call void @foo(float* [[t1]]) + %3 = getelementptr float, float* %a, i128 %1 +; CHECK: [[truncj:[^ ]+]] = trunc i128 %j to i64 +; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 [[truncj]] + call void @foo(float* %3) +; CHECK: call void @foo(float* [[t2]]) + ret void +}