diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
--- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
+++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h
@@ -83,6 +83,12 @@
 
   bool simplifyInstWithSCEV(Instruction *I);
 
+  /// Try to simplify an address computation outside of the loop body, so we can
+  /// compare it with simplified addresses in the loop. If \p V is a pointer
+  /// instruction outside the loop whose SCEV is a base plus a constant offset,
+  /// the base/offset pair is recorded in SimplifiedAddresses.
+  void simplifyNonLoopAddress(Value *V);
+
   bool visitInstruction(Instruction &I) { return simplifyInstWithSCEV(&I); }
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoad(LoadInst &I);
diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
--- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -60,6 +60,32 @@
   return false;
 }
 
+void UnrolledInstAnalyzer::simplifyNonLoopAddress(Value *V) {
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  // For now we only try to simplify address computation instructions outside
+  // the current loop.
+  if (!Inst || !Inst->getType()->isPointerTy() ||
+      !SE.isSCEVable(Inst->getType()) || L->contains(Inst))
+    return;
+  // Nothing to do if this address already has a simplified form recorded.
+  if (SimplifiedAddresses.count(Inst))
+    return;
+
+  const SCEV *S = SE.getSCEV(Inst);
+  // Check if the offset from the base address becomes a constant.
+  auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S));
+  if (!Base)
+    return;
+  auto *Offset = dyn_cast<SCEVConstant>(SE.getMinusSCEV(S, Base));
+  if (!Offset)
+    return;
+
+  SimplifiedAddress Address;
+  Address.Base = Base->getValue();
+  Address.Offset = Offset->getValue();
+  SimplifiedAddresses[V] = Address;
+}
+
 /// Try to simplify binary operator I.
 ///
 /// TODO: Probably it's worth to hoist the code for estimating the
@@ -175,6 +201,8 @@
       RHS = SimpleRHS;
 
   if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+    simplifyNonLoopAddress(LHS);
+    simplifyNonLoopAddress(RHS);
     auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
     if (SimplifiedLHS != SimplifiedAddresses.end()) {
       auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -423,9 +423,10 @@
 
         // First accumulate the cost of this instruction.
         if (!Cost.IsFree) {
-          UnrolledCost += TTI.getUserCost(I);
+          unsigned UserCost = TTI.getUserCost(I);
+          UnrolledCost += UserCost;
           LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
-                            << Iteration << "): ");
+                            << Iteration << "): " << UserCost << " ");
           LLVM_DEBUG(I->dump());
         }
 
diff --git a/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll b/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unrolled-inst-analyzer-pointers.ll
@@ -0,0 +1,58 @@
+; RUN: opt -loop-unroll -debug < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; Check that we simplify pointers outside of the loop for comparisons.
+; %cmp1.i.i is free when unrolling, so no cost is reported for it below.
+
+define i32 @test_2_iters(i32 %a, i32 %b, i32 %c, i32 %d) optsize {
+; CHECK-LABEL: Loop Unroll: F[test_2_iters] Loop %loop
+; CHECK-NEXT: Loop Size = 7
+; CHECK-NEXT: Starting LoopUnroll profitability analysis...
+; CHECK-NEXT: Analyzing iteration 0
+; CHECK-NEXT: Analyzing iteration 1
+; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7
+; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %cmp.i.i.i = icmp slt i32 %.pre, %.pre3
+; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %.pre3 = load i32, i32* %incdec.ptr.i.i8, align 4
+; CHECK-NEXT: Adding cost of instruction (iteration 1): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4
+; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %incdec.ptr.i.i = getelementptr inbounds i32, i32* %incdec.ptr.i.i8, i64 1
+; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7
+; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %cmp.i.i.i = icmp slt i32 %.pre, %.pre3
+; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %.pre3 = load i32, i32* %incdec.ptr.i.i8, align 4
+; CHECK-NEXT: Adding cost of instruction (iteration 0): 1 %.pre = load i32, i32* %spec.select.i.i7, align 4
+; CHECK-NEXT: Analysis finished:
+; CHECK-NEXT: UnrolledCost: 9, RolledDynamicCost: 14
+
+
+entry:
+  %ref.tmp = alloca [4 x i32], align 4
+  %0 = bitcast [4 x i32]* %ref.tmp to i8*
+  %arrayinit.begin = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 0
+  store i32 %a, i32* %arrayinit.begin, align 4
+  %arrayinit.element = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 1
+  store i32 %b, i32* %arrayinit.element, align 4
+  %arrayinit.element1 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 2
+  store i32 %c, i32* %arrayinit.element1, align 4
+  %arrayinit.element2 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 3
+  store i32 %d, i32* %arrayinit.element2, align 4
+  %add.ptr.i.i = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 4
+  %cmp.i.i.i4 = icmp slt i32 %a, %b
+  %spec.select.i.i5 = select i1 %cmp.i.i.i4, i32* %arrayinit.element, i32* %arrayinit.begin
+  %incdec.ptr.i.i6 = getelementptr inbounds [4 x i32], [4 x i32]* %ref.tmp, i64 0, i64 2
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  %incdec.ptr.i.i8 = phi i32* [ %incdec.ptr.i.i6, %entry ], [ %incdec.ptr.i.i, %loop ]
+  %spec.select.i.i7 = phi i32* [ %spec.select.i.i5, %entry ], [ %spec.select.i.i, %loop ]
+  %.pre = load i32, i32* %spec.select.i.i7, align 4
+  %.pre3 = load i32, i32* %incdec.ptr.i.i8, align 4
+  %cmp.i.i.i = icmp slt i32 %.pre, %.pre3
+  %spec.select.i.i = select i1 %cmp.i.i.i, i32* %incdec.ptr.i.i8, i32* %spec.select.i.i7
+  %incdec.ptr.i.i = getelementptr inbounds i32, i32* %incdec.ptr.i.i8, i64 1
+  %cmp1.i.i = icmp eq i32* %incdec.ptr.i.i, %add.ptr.i.i
+  br i1 %cmp1.i.i, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  %1 = load i32, i32* %spec.select.i.i, align 4
+  ret i32 %1
+}