Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -163,6 +163,13 @@
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
 
+  /// Model the elimination of repeated loads that is expected to happen
+  /// whenever we simplify away the stores that would otherwise cause them to be
+  /// loads.
+  bool EnableLoadElimination;
+  SmallPtrSet<Value *, 16> LoadAddrSet;
+  int LoadEliminationCost;
+
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
@@ -171,6 +178,7 @@
   void disableSROA(Value *V);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
+  void disableLoadElimination();
   bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
@@ -264,10 +272,10 @@
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
-        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -322,6 +330,7 @@
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
+  disableLoadElimination();
 }
 
 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
@@ -339,6 +348,13 @@
   SROACostSavings += InstructionCost;
 }
 
+void CallAnalyzer::disableLoadElimination() {
+  if (EnableLoadElimination) {
+    Cost += LoadEliminationCost;
+    EnableLoadElimination = false;
+  }
+}
+
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
@@ -992,6 +1008,15 @@
     disableSROA(CostIt);
   }
 
+  // If the data is already loaded from this address and hasn't been clobbered
+  // by any stores or calls, this load is likely to be redundant and can be
+  // eliminated.
+  if (EnableLoadElimination &&
+      !LoadAddrSet.insert(I.getPointerOperand()).second) {
+    LoadEliminationCost += InlineConstants::InstrCost;
+    return true;
+  }
+
   return false;
 }
 
@@ -1007,6 +1032,15 @@
     disableSROA(CostIt);
   }
 
+  // The store can potentially clobber loads and prevent repeated loads from
+  // being eliminated.
+  // FIXME:
+  // 1. We can probably keep an initial set of eliminatable loads subtracted
+  // from the cost even when we finally see a store. We just need to disable
+  // *further* accumulation of elimination savings.
+  // 2. We should probably at some point thread MemorySSA for the callee into
+  // this and then use that to actually compute *really* precise savings.
+  disableLoadElimination();
   return false;
 }
 
@@ -1089,6 +1123,8 @@
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
     switch (II->getIntrinsicID()) {
     default:
+      if (!CS.onlyReadsMemory())
+        disableLoadElimination();
       return Base::visitCallSite(CS);
 
     case Intrinsic::load_relative:
@@ -1099,6 +1135,7 @@
     case Intrinsic::memset:
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
+      disableLoadElimination();
       // SROA can usually chew through these intrinsics, but they aren't free.
       return false;
     case Intrinsic::localescape:
@@ -1125,6 +1162,8 @@
       Cost += InlineConstants::CallPenalty;
     }
 
+    if (!CS.onlyReadsMemory())
+      disableLoadElimination();
     return Base::visitCallSite(CS);
   }
 
@@ -1139,8 +1178,11 @@
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
-  if (!F)
+  if (!F) {
+    if (!CS.onlyReadsMemory())
+      disableLoadElimination();
     return Base::visitCallSite(CS);
+  }
 
   // If we have a constant that we are calling as a function, we can peer
   // through it and see the function target. This happens not infrequently
@@ -1157,6 +1199,8 @@
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
   }
 
+  if (!F->onlyReadsMemory())
+    disableLoadElimination();
   return Base::visitCallSite(CS);
 }
 
@@ -1631,6 +1675,7 @@
   DEBUG_PRINT_STAT(NumInstructions);
   DEBUG_PRINT_STAT(SROACostSavings);
   DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(LoadEliminationCost);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
   DEBUG_PRINT_STAT(Threshold);
Index: test/Transforms/Inline/redundant-loads.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/redundant-loads.ll
@@ -0,0 +1,142 @@
+; RUN: opt -inline < %s -S -o - -inline-threshold=3 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @pad() readnone
+
+define void @outer1(i32* %a) {
+; CHECK-LABEL: @outer1(
+; CHECK-NOT: call void @inner1
+  %b = alloca i32
+  call void @inner1(i32* %a, i32* %b)
+  ret void
+}
+
+define void @inner1(i32* %a, i32* %b) {
+  %1 = load i32, i32* %a
+  store i32 %1, i32* %b ; This store does not clobber the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  %3 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer2(i32* %a, i32* %b) {
+; CHECK-LABEL: @outer2(
+; CHECK: call void @inner2
+  call void @inner2(i32* %a, i32* %b)
+  ret void
+}
+
+define void @inner2(i32* %a, i32* %b) {
+  %1 = load i32, i32* %a
+  store i32 %1, i32* %b ; This store clobbers the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  ret void
+}
+
+
+define void @outer3(i32* %a) {
+; CHECK-LABEL: @outer3(
+; CHECK: call void @inner3
+  call void @inner3(i32* %a)
+  ret void
+}
+
+declare void @ext()
+
+define void @inner3(i32* %a) {
+  %1 = load i32, i32* %a
+  call void @ext() ; This call clobbers the first load.
+  %2 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer4(i32* %a, i32* %b, i32* %c) {
+; CHECK-LABEL: @outer4(
+; CHECK-NOT: call void @inner4
+  call void @inner4(i32* %a, i32* %b, i1 false)
+  ret void
+}
+
+define void @inner4(i32* %a, i32* %b, i1 %pred) {
+  %1 = load i32, i32* %a
+  br i1 %pred, label %cond_true, label %cond_false
+
+cond_true:
+  store i32 %1, i32* %b ; This store does not clobber the first load.
+  br label %cond_false
+
+cond_false:
+  %2 = load i32, i32* %a
+  call void @pad()
+  %3 = load i32, i32* %a
+  %4 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer5(i32* %a, double %b) {
+; CHECK-LABEL: @outer5(
+; CHECK-NOT: call void @inner5
+  call void @inner5(i32* %a, double %b)
+  ret void
+}
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+define void @inner5(i32* %a, double %b) {
+  %1 = load i32, i32* %a
+  %2 = call double @llvm.fabs.f64(double %b) ; This intrinsic does not clobber the first load.
+  %3 = load i32, i32* %a
+  call void @pad()
+  ret void
+}
+
+
+define void @outer6(i32* %a) {
+; CHECK-LABEL: @outer6(
+; CHECK-NOT: call void @inner6
+  call void @inner6(i32* %a)
+  ret void
+}
+
+declare void @ext2() readnone
+
+define void @inner6(i32* %a) {
+  %1 = load i32, i32* %a
+  call void @ext2() ; This call does not clobber the first load.
+  %2 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer7(i32* %a) {
+; CHECK-LABEL: @outer7(
+; CHECK-NOT: call void @inner7
+  call void @inner7(i32* %a, void ()* @ext2)
+  ret void
+}
+
+define void @inner7(i32* %a, void ()* %f) {
+  %1 = load i32, i32* %a
+  call void %f() ; This indirect call does not clobber the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  call void @pad()
+  ret void
+}
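
The accounting above is compact, so here is a minimal standalone C++ sketch of how EnableLoadElimination, LoadAddrSet, and LoadEliminationCost interact. This is not LLVM code: the LoadEliminationModel struct, the string-keyed address set, and the InstrCost constant are stand-ins invented for illustration, and the per-instruction base cost that the real analyzer accrues in its generic visitor is folded into visitLoad here.

#include <iostream>
#include <set>
#include <string>

namespace {
constexpr int InstrCost = 5; // stand-in for InlineConstants::InstrCost

// Mirrors the bookkeeping the patch adds to CallAnalyzer: a repeated load
// from an address already seen accrues a speculative saving, and any
// potential clobber forfeits whatever saving has accumulated so far.
struct LoadEliminationModel {
  bool EnableLoadElimination = true;
  std::set<std::string> LoadAddrSet; // addresses of loads seen so far
  int LoadEliminationCost = 0;       // savings accumulated speculatively
  int Cost = 0;                      // running inline-cost estimate

  // Returns true if the load is modeled as eliminable (free).
  bool visitLoad(const std::string &Addr) {
    if (EnableLoadElimination && !LoadAddrSet.insert(Addr).second) {
      LoadEliminationCost += InstrCost;
      return true;
    }
    Cost += InstrCost;
    return false;
  }

  // Called for stores and memory-writing calls: the clobber may make the
  // "eliminated" loads real again, so the accumulated saving is added back.
  void disableLoadElimination() {
    if (EnableLoadElimination) {
      Cost += LoadEliminationCost;
      EnableLoadElimination = false;
    }
  }
};
} // namespace

int main() {
  LoadEliminationModel M;
  M.visitLoad("%a");          // first load from %a: paid for (Cost = 5)
  M.visitLoad("%a");          // repeated load: modeled as free (saving = 5)
  M.disableLoadElimination(); // clobber: forfeit the saving (Cost = 10)
  M.visitLoad("%a");          // loads are paid for again (Cost = 15)
  std::cout << "final modeled cost: " << M.Cost << "\n"; // prints 15
  return 0;
}

The key property mirrored from the patch is that the savings are purely speculative: they only stick if no potential clobber is seen before the analysis of the callee finishes.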