Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp =================================================================== --- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -320,6 +321,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } @@ -379,6 +381,9 @@ const DataLayout *DL; const DominatorTree *DT; const TargetMachine *TM; + + LoopInfo *LI; + DenseMap NumOfCommonBaseGEPs; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; @@ -734,10 +739,48 @@ Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); Value *ResultPtr = Variadic->getOperand(0); + Loop *L = LI->getLoopFor(Variadic->getParent()); + bool GenerateConstFirst = false; + + gep_type_iterator GTI = gep_type_begin(*Variadic); + if (L && L->isLoopInvariant(ResultPtr) && + NumOfCommonBaseGEPs[ResultPtr] == 1) { + + // We prefer to generate the GEP with the constant offset first if the other + // GEPs have a loop-variant operand and don't have a foldable constant. + for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + if (isa(*GTI)) { + Value *Idx = Variadic->getOperand(I); + // Skip constants. + if (dyn_cast(Idx)) + continue; + + if (L && L->isLoopInvariant(Idx)) + continue; + + GenerateConstFirst = true; + // If the other offset contains a constant too, it could be folded later, + // so we keep it in the original order.
+ if (BinaryOperator *BO = dyn_cast(Idx)) + if (dyn_cast(BO->getOperand(0)) || + dyn_cast(BO->getOperand(1))) + GenerateConstFirst = false; + } + } + } + if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); + // Create a GEP with the constant offset index. + // Create this first if the base is loop invariant and is only used by one + // GEP; that way it can be hoisted out of the loop by later optimizations. + if (AccumulativeByteOffset != 0 && GenerateConstFirst) { + Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); + ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep"); + } + + GTI = gep_type_begin(*Variadic); // Create an ugly GEP for each sequential index. We don't create GEPs for // structure indices, as they are accumulated in the constant offset index. for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { @@ -766,7 +809,8 @@ } // Create a GEP with the constant offset index. - if (AccumulativeByteOffset != 0) { + // Only need to do this if we didn't do it earlier. + if (AccumulativeByteOffset != 0 && !GenerateConstFirst) { Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); @@ -1008,16 +1052,27 @@ return false; DT = &getAnalysis().getDomTree(); - + LI = &getAnalysis().getLoopInfo(); bool Changed = false; + DenseMap LoopVisited; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast(I++)) { - Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. + + // Record the number of times the base of a GEP is used inside a loop.
+ NumOfCommonBaseGEPs.clear(); + Loop *L = LI->getLoopFor(B); + if (L && !LoopVisited[L]) { + LoopVisited[L] = true; + for (auto BB : L->getBlocks()) + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast(I++)) + NumOfCommonBaseGEPs[GEP->getOperand(0)]++; } + + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast(I++)) + Changed |= splitGEP(GEP); + // No need to split GEP ConstantExprs because all its indices are constant + // already. } if (VerifyNoDeadCode) Index: test/CodeGen/AArch64/aarch64-loop-gep-opt.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-loop-gep-opt.ll @@ -0,0 +1,50 @@ +; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s +; REQUIRES: asserts +target triple = "aarch64--linux-android" + +%typeD = type { i32, i32, [256 x i32], [257 x i32] } + +; Function Attrs: noreturn nounwind uwtable +define i32 @test1(%typeD* nocapture %s) { +entry: +; CHECK-LABEL: entry: +; CHECK: %uglygep = getelementptr i8, i8* %0, i64 1032 +; CHECK: br label %do.body.i + + + %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0 + %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1 + %.pre = load i32, i32* %tPos, align 4 + br label %do.body.i + +do.body.i: +; CHECK-LABEL: do.body.i: +; CHECK: %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3 +; CHECK-NEXT: %4 = bitcast i8* %uglygep2 to i32* +; CHECK-NOT: %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032 + + + %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ] + %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ] + %add.i = add nsw i32 %1, %0 + %shr.i = ashr i32 %add.i, 1 + %idxprom.i = sext i32 %shr.i to i64 + %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i + %2 = load i32, i32* %arrayidx.i, align 4 + %cmp.i = 
icmp sle i32 %2, %.pre + %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i + %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1 + %sub.i = sub nsw i32 %na.1.i, %nb.1.i + %cmp1.i = icmp eq i32 %sub.i, 1 + br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge + +do.body.i.backedge: + %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ] + %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ] + br label %do.body.i + +fooo.exit: ; preds = %do.body.i + store i32 %nb.1.i, i32* %k0, align 4 + br label %do.body.i.backedge +} +