Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2236,6 +2236,11 @@
     return false;
   }
 
+  // Return true if CodeGenPrepare should consider splitting the large offset
+  // of a GEP so that the GEP fits into the addressing mode and can be sunk
+  // into the same blocks as its users.
+  virtual bool shouldConsiderGEPOffsetSplit() const { return false; }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -215,6 +215,11 @@
     "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
     cl::desc("Allow combining of ScaledReg field in Address sinking."));
 
+static cl::opt<bool>
+    EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
+                         cl::init(true),
+                         cl::desc("Enable splitting large offset of GEP."));
+
 namespace {
 
 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
@@ -260,6 +265,14 @@
   /// Keep track of sext chains based on their initial value.
   DenseMap<Value *, Instruction *> SeenChainsForSExt;
 
+  // Keep track of GEPs accessing the same data structures, such as structs or
+  // arrays, that are candidates to be split later because of their large size.
+  DenseMap<Value *, SmallVector<std::pair<GetElementPtrInst *, int64_t>, 32>>
+      LargeOffsetGEPMap;
+
+  // Keep track of the new GEP bases after splitting GEPs with large offsets.
+  SmallPtrSet<Value *, 2> NewGEPBases;
+
   /// Keep track of SExt promoted.
   ValueToSExts ValToSExtendedUses;
@@ -321,6 +334,7 @@
                           SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
                           unsigned CreatedInstsCost = 0);
     bool mergeSExts(Function &F);
+    bool splitLargeGEPOffsets();
     bool performAddressTypePromotion(
         Instruction *&Inst,
         bool AllowPromotionWithoutCommonHeader,
@@ -414,6 +428,7 @@
   SeenChainsForSExt.clear();
   ValToSExtendedUses.clear();
   RemovedInsts.clear();
+  LargeOffsetGEPMap.clear();
   for (Function::iterator I = F.begin(); I != F.end(); ) {
     BasicBlock *BB = &*I++;
     bool ModifiedDTOnIteration = false;
@@ -425,6 +440,8 @@
   }
   if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
     MadeChange |= mergeSExts(F);
+  if (!LargeOffsetGEPMap.empty())
+    MadeChange |= splitLargeGEPOffsets();
 
   // Really free removed instructions during promotion.
   for (Instruction *I : RemovedInsts)
@@ -2527,22 +2544,25 @@
   /// The ongoing transaction where every action should be registered.
   TypePromotionTransaction &TPT;
 
+  // A GEP whose offset is too large to be folded into the addressing mode.
+  std::pair<GetElementPtrInst *, int64_t> &LargeOffsetGEP;
+
   /// This is set to true when we should not do profitability checks.
   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
   bool IgnoreProfitability;
 
   AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
                         const TargetLowering &TLI,
-                        const TargetRegisterInfo &TRI,
-                        Type *AT, unsigned AS,
+                        const TargetRegisterInfo &TRI, Type *AT, unsigned AS,
                         Instruction *MI, ExtAddrMode &AM,
                         const SetOfInstrs &InsertedInsts,
                         InstrToOrigTy &PromotedInsts,
-                        TypePromotionTransaction &TPT)
+                        TypePromotionTransaction &TPT,
+                        std::pair<GetElementPtrInst *, int64_t> &LargeOffsetGEP)
       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
-        PromotedInsts(PromotedInsts), TPT(TPT) {
+        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) {
     IgnoreProfitability = false;
   }
 
@@ -2554,20 +2574,19 @@
   /// optimizations.
  /// \p PromotedInsts maps the instructions to their type before promotion.
  /// \p The ongoing transaction where every action should be registered.
-  static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
-                           Instruction *MemoryInst,
-                           SmallVectorImpl<Instruction *> &AddrModeInsts,
-                           const TargetLowering &TLI,
-                           const TargetRegisterInfo &TRI,
-                           const SetOfInstrs &InsertedInsts,
-                           InstrToOrigTy &PromotedInsts,
-                           TypePromotionTransaction &TPT) {
+  static ExtAddrMode
+  Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
+        SmallVectorImpl<Instruction *> &AddrModeInsts,
+        const TargetLowering &TLI, const TargetRegisterInfo &TRI,
+        const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
+        TypePromotionTransaction &TPT,
+        std::pair<GetElementPtrInst *, int64_t> &LargeOffsetGEP) {
     ExtAddrMode Result;
 
-    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
-                                         AccessTy, AS,
+    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
                                          MemoryInst, Result, InsertedInsts,
-                                         PromotedInsts, TPT).matchAddr(V, 0);
+                                         PromotedInsts, TPT, LargeOffsetGEP)
+                       .matchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
     return Result;
   }
@@ -3745,6 +3764,28 @@
       // Check to see if we can fold the base pointer in too.
       if (matchAddr(AddrInst->getOperand(0), Depth+1))
         return true;
+    } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
+               TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
+               ConstantOffset > 0) {
+      // Record GEPs with non-zero offsets as candidates for splitting in the
+      // event that the offset cannot fit into the r+i addressing mode.
+      // This handles the simple and common case in which only one GEP is used
+      // in calculating the address for the memory access.
+      Value *Base = AddrInst->getOperand(0);
+      auto *BaseI = dyn_cast<Instruction>(Base);
+      auto *GEP = cast<GetElementPtrInst>(AddrInst);
+      if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
+          (BaseI && !isa<CastInst>(BaseI) && !isa<BinaryOperator>(BaseI) &&
+           !isa<GetElementPtrInst>(BaseI))) {
+        // If the base is an instruction, make sure the GEP is not in the same
+        // basic block as the base. If the base is an argument or global
+        // value, make sure the GEP is not in the entry block. Otherwise,
+        // instruction selection can undo the split.
+        BasicBlock *Parent =
+            BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
+        if (GEP->getParent() != Parent)
+          LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
+      }
     }
     AddrMode.BaseOffs -= ConstantOffset;
     return false;
@@ -4155,12 +4196,12 @@
   // will tell us if the addressing mode for the memory operation will
   // *actually* cover the shared instruction.
   ExtAddrMode Result;
+  std::pair<GetElementPtrInst *, int64_t> LargeOffsetGEP(nullptr, 0);
   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
       TPT.getRestorationPoint();
-  AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
-                                AddressAccessTy, AS,
-                                MemoryInst, Result, InsertedInsts,
-                                PromotedInsts, TPT);
+  AddressingModeMatcher Matcher(
+      MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
+      InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
   Matcher.IgnoreProfitability = true;
   bool Success = Matcher.matchAddr(Address, 0);
   (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -4262,11 +4303,20 @@
     // the result may differ depending on what other uses our candidate
     // addressing instructions might have.
    AddrModeInsts.clear();
+    std::pair<GetElementPtrInst *, int64_t> LargeOffsetGEP(nullptr, 0);
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
-        InsertedInsts, PromotedInsts, TPT);
-    NewAddrMode.OriginalValue = V;
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+
+    // If splitting the underlying data structure can reduce the offset of a
+    // GEP, collect the GEP.  Skip the GEPs that are the new bases of
+    // previously split data structures.
+    GetElementPtrInst *GEP = LargeOffsetGEP.first;
+    if (GEP && GEP->getParent() != MemoryInst->getParent() &&
+        !NewGEPBases.count(LargeOffsetGEP.first))
+      LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
 
+    NewAddrMode.OriginalValue = V;
     if (!AddrModes.addNewAddrMode(NewAddrMode))
       break;
   }
@@ -4775,6 +4825,149 @@
   return Changed;
 }
 
+// Compare the constant offsets of two GEPs.
+static int
+compareGEPOffset(const std::pair<GetElementPtrInst *, int64_t> *LHS,
+                 const std::pair<GetElementPtrInst *, int64_t> *RHS) {
+  if (LHS->first == RHS->first)
+    return 0;
+  if (LHS->second != RHS->second)
+    return LHS->second > RHS->second ? 1 : -1;
+  return LHS->first > RHS->first ? 1 : -1;
+}
+
+// Split large data structures so that the GEPs accessing them can have
+// smaller offsets and can therefore be sunk to the same blocks as their
+// users.  For example, a large struct starting at %base is split into two
+// parts, where the second part starts at %new_base.
+//
+// Before:
+// BB0:
+//   %base     =
+//
+// BB1:
+//   %gep0     = gep %base, off0
+//   %gep1     = gep %base, off1
+//   %gep2     = gep %base, off2
+//
+// BB2:
+//   %load1    = load %gep0
+//   %load2    = load %gep1
+//   %load3    = load %gep2
+//
+// After:
+// BB0:
+//   %base     =
+//   %new_base = gep %base, off0
+//
+// BB1:
+//   %new_gep0 = %new_base
+//   %new_gep1 = gep %new_base, off1 - off0
+//   %new_gep2 = gep %new_base, off2 - off0
+//
+// BB2:
+//   %load1    = load i32, i32* %new_gep0
+//   %load2    = load i32, i32* %new_gep1
+//   %load3    = load i32, i32* %new_gep2
+//
+// %new_gep1 and %new_gep2 can be sunk to BB2 after the splitting because
+// their offsets are small enough to fit into the addressing mode.
+bool CodeGenPrepare::splitLargeGEPOffsets() {
+  bool Changed = false;
+  for (auto &Entry : LargeOffsetGEPMap) {
+    SmallVectorImpl<std::pair<GetElementPtrInst *, int64_t>> &LargeOffsetGEPs =
+        Entry.second;
+    // Sort all the GEPs of the same data structure based on their offsets.
+    array_pod_sort(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end(),
+                   compareGEPOffset);
+    LargeOffsetGEPs.erase(
+        std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
+        LargeOffsetGEPs.end());
+    // Skip if all the GEPs have the same offset.
+    if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
+      continue;
+    GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
+    int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
+    Value *NewBaseGEP = nullptr;
+    for (auto &LargeOffsetGEP : LargeOffsetGEPs) {
+      GetElementPtrInst *GEP = LargeOffsetGEP.first;
+      int64_t Offset = LargeOffsetGEP.second;
+      if (Offset != BaseOffset) {
+        TargetLowering::AddrMode AddrMode;
+        AddrMode.BaseOffs = Offset - BaseOffset;
+        if (!TLI->isLegalAddressingMode(*DL, AddrMode,
+                                        GEP->getResultElementType(),
+                                        GEP->getAddressSpace())) {
+          // We need to create a new base if the offset to the current base is
+          // too large to fit into the addressing mode.  So, a very large
+          // struct may be split into several parts.
+          BaseGEP = GEP;
+          BaseOffset = Offset;
+          NewBaseGEP = nullptr;
+        }
+      }
+
+      // Generate a new GEP to replace the current one.
+      IRBuilder<> Builder(GEP);
+      Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+      Type *I8PtrTy =
+          Builder.getInt8PtrTy(GEP->getType()->getPointerAddressSpace());
+      Type *I8Ty = Builder.getInt8Ty();
+
+      if (!NewBaseGEP) {
+        // Create a new base if we don't have one yet.  Find the insertion
+        // point for the new base first.
+        BasicBlock::iterator NewBaseInsertPt;
+        BasicBlock *NewBaseInsertBB;
+        if (auto *BaseI = dyn_cast<Instruction>(Entry.first)) {
+          // If the base of the struct is an instruction, the new base will be
+          // inserted close to it.
+          NewBaseInsertBB = BaseI->getParent();
+          if (isa<PHINode>(BaseI))
+            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+          else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+            NewBaseInsertBB =
+                SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
+            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+          } else
+            NewBaseInsertPt = std::next(BaseI->getIterator());
+        } else {
+          // If the current base is an argument or global value, the new base
+          // will be inserted into the entry block.
+          NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
+          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+        }
+        IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+        // Create a new base.
+        Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
+        NewBaseGEP = Entry.first;
+        if (NewBaseGEP->getType() != I8PtrTy)
+          NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+        NewBaseGEP =
+            NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+        NewGEPBases.insert(NewBaseGEP);
+      }
+
+      Value *NewGEP = NewBaseGEP;
+      if (Offset == BaseOffset) {
+        if (GEP->getType() != I8PtrTy)
+          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+      } else {
+        // Calculate the new offset for the new GEP.
+        Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
+        NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
+
+        if (GEP->getType() != I8PtrTy)
+          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+      }
+      GEP->replaceAllUsesWith(NewGEP);
+      GEP->eraseFromParent();
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
 /// Return true, if an ext(load) can be formed from an extension in
 /// \p MovedExts.
 bool CodeGenPrepare::canFormExtLd(
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -335,6 +335,8 @@
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
+  bool shouldConsiderGEPOffsetSplit() const override;
+
   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                           MachineFunction &MF) const override;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8214,6 +8214,11 @@
   return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
 }
 
+bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
+  // Consider splitting large offsets of struct or array accesses.
+  return true;
+}
+
 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                 const AddrMode &AM, Type *Ty,
                                                 unsigned AS) const {
Index: test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -0,0 +1,147 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+
+%struct_type = type { [10000 x i32], i32, i32 }
+
+define void @test1(%struct_type** %s, i32 %n) {
+; CHECK-LABEL: test1
+entry:
+  %struct = load %struct_type*, %struct_type** %s
+  br label %while_cond
+
+while_cond:
+  %phi = phi i32 [ 0, %entry ], [ %i, %while_body ]
+; CHECK: mov w{{[0-9]+}}, #40000
+; CHECK-NOT: mov w{{[0-9]+}}, #40004
+  %gep0 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 1
+  %gep1 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 2
+  %cmp = icmp slt i32 %phi, %n
+  br i1 %cmp, label %while_body, label %while_end
+
+while_body:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]
+  %i = add i32 %phi, 1
+  store i32 %i, i32* %gep0
+  store i32 %phi, i32* %gep1
+  br label %while_cond
+
+while_end:
+  ret void
+}
+
+define void @test2(%struct_type* %struct, i32 %n) {
+; CHECK-LABEL: test2
+entry:
+  %cmp = icmp eq %struct_type* %struct, null
+  br i1 %cmp, label %while_end, label %while_cond
+
+while_cond:
+  %phi = phi i32 [ 0, %entry ], [ %i, %while_body ]
+; CHECK: mov w{{[0-9]+}}, #40000
+; CHECK-NOT: mov w{{[0-9]+}}, #40004
+  %gep0 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 1
+  %gep1 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 2
+  %cmp1 = icmp slt i32 %phi, %n
+  br i1 %cmp1, label %while_body, label %while_end
+
+while_body:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]
+  %i = add i32 %phi, 1
+  store i32 %i, i32* %gep0
+  store i32 %phi, i32* %gep1
+  br label %while_cond
+
+while_end:
+  ret void
+}
+
+define void @test3(%struct_type* %s1, %struct_type* %s2, i1 %cond, i32 %n) {
+; CHECK-LABEL: test3
+entry:
+  br i1 %cond, label %if_true, label %if_end
+
+if_true:
+  br label %if_end
+
+if_end:
+  %struct = phi %struct_type* [ %s1, %entry ], [ %s2, %if_true ]
+  %cmp = icmp eq %struct_type* %struct, null
+  br i1 %cmp, label %while_end, label %while_cond
+
+while_cond:
+  %phi = phi i32 [ 0, %if_end ], [ %i, %while_body ]
+; CHECK: mov w{{[0-9]+}}, #40000
+; CHECK-NOT: mov w{{[0-9]+}}, #40004
+  %gep0 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 1
+  %gep1 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 2
+  %cmp1 = icmp slt i32 %phi, %n
+  br i1 %cmp1, label %while_body, label %while_end
+
+while_body:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]
+  %i = add i32 %phi, 1
+  store i32 %i, i32* %gep0
+  store i32 %phi, i32* %gep1
+  br label %while_cond
+
+while_end:
+  ret void
+}
+
+declare %struct_type* @foo()
+
+define void @test4(i32 %n) personality i32 (...)* @__FrameHandler {
+; CHECK-LABEL: test4
+entry:
+  %struct = invoke %struct_type* @foo()
+          to label %while_cond unwind label %cleanup
+
+while_cond:
+  %phi = phi i32 [ 0, %entry ], [ %i, %while_body ]
+; CHECK: mov w{{[0-9]+}}, #40000
+; CHECK-NOT: mov w{{[0-9]+}}, #40004
+  %gep0 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 1
+  %gep1 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 2
+  %cmp = icmp slt i32 %phi, %n
+  br i1 %cmp, label %while_body, label %while_end
+
+while_body:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]
+  %i = add i32 %phi, 1
+  store i32 %i, i32* %gep0
+  store i32 %phi, i32* %gep1
+  br label %while_cond
+
+while_end:
+  ret void
+
+cleanup:
+  landingpad { i8*, i32 } cleanup
+  unreachable
+}
+
+declare i32 @__FrameHandler(...)
+
+define void @test5([65536 x i32]** %s, i32 %n) {
+; CHECK-LABEL: test5
+entry:
+  %struct = load [65536 x i32]*, [65536 x i32]** %s
+  br label %while_cond
+
+while_cond:
+  %phi = phi i32 [ 0, %entry ], [ %i, %while_body ]
+; CHECK: mov w{{[0-9]+}}, #14464
+; CHECK-NOT: mov w{{[0-9]+}}, #14468
+  %gep0 = getelementptr [65536 x i32], [65536 x i32]* %struct, i64 0, i32 20000
+  %gep1 = getelementptr [65536 x i32], [65536 x i32]* %struct, i64 0, i32 20001
+  %cmp = icmp slt i32 %phi, %n
+  br i1 %cmp, label %while_body, label %while_end
+
+while_body:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]
+  %i = add i32 %phi, 1
+  store i32 %i, i32* %gep0
+  store i32 %phi, i32* %gep1
+  br label %while_cond
+
+while_end:
+  ret void
+}
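
For reviewers, a minimal IR sketch (illustration only, not part of the patch) of the rewrite splitLargeGEPOffsets() is expected to perform on code shaped like @test1 above. The function names @before/@after, the block names %addr/%use, and the value names %base.i8, %new_gep0, %off4, and %new_gep1 are hypothetical; only the "splitgep" name prefix for the new base comes from the patch, and the exact pointer casts depend on the types involved.

; Before: both constant offsets (40000 and 40004) are too large for the
; AArch64 [reg, imm] addressing mode, so each GEP would otherwise
; materialize its own large constant.
%struct_type = type { [10000 x i32], i32, i32 }

define void @before(%struct_type* %struct) {
entry:
  br label %addr

addr:                                   ; GEPs sit away from base and users
  %gep0 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 1
  %gep1 = getelementptr %struct_type, %struct_type* %struct, i64 0, i32 2
  br label %use

use:
  store i32 0, i32* %gep0
  store i32 1, i32* %gep1
  ret void
}

; After: the smallest large offset (40000) is folded into one new i8 GEP next
; to the base; the remaining access becomes %splitgep + 4, which fits the
; addressing mode, so both stores share a single materialization of 40000.
define void @after(%struct_type* %struct) {
entry:
  %base.i8 = bitcast %struct_type* %struct to i8*
  %splitgep = getelementptr i8, i8* %base.i8, i64 40000
  br label %addr

addr:
  %new_gep0 = bitcast i8* %splitgep to i32*
  %off4 = getelementptr i8, i8* %splitgep, i64 4
  %new_gep1 = bitcast i8* %off4 to i32*
  br label %use

use:
  store i32 0, i32* %new_gep0
  store i32 1, i32* %new_gep1
  ret void
}

This also shows why the patch requires the candidate GEP to live in a different block from its base (and skips GEPs in the same block as the memory access): the split only pays off when the cheap +4 GEP can subsequently be sunk into the using block while the expensive +40000 base stays hoisted next to %struct.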