Index: llvm/trunk/include/llvm/Analysis/ValueTracking.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/ValueTracking.h
+++ llvm/trunk/include/llvm/Analysis/ValueTracking.h
@@ -366,6 +366,10 @@
   /// operands are not memory dependent.
   bool mayBeMemoryDependent(const Instruction &I);
 
+  /// Return true if it is an intrinsic that cannot be speculated but also
+  /// cannot trap.
+  bool isAssumeLikeIntrinsic(const Instruction *I);
+
   /// Return true if it is valid to use the assumptions provided by an
   /// assume intrinsic, I, at the point in the control-flow identified by the
   /// context instruction, CxtI.
Index: llvm/trunk/lib/Analysis/InlineCost.cpp
===================================================================
--- llvm/trunk/lib/Analysis/InlineCost.cpp
+++ llvm/trunk/lib/Analysis/InlineCost.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
@@ -171,6 +172,13 @@
   /// constant arguments.
   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
 
+  /// Model the elimination of repeated loads that is expected to happen
+  /// whenever we simplify away the stores that would otherwise clobber
+  /// those loads.
+  bool EnableLoadElimination;
+  SmallPtrSet<Value *, 16> LoadAddrSet;
+  int LoadEliminationCost;
+
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
@@ -180,6 +188,7 @@
   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
+  void disableLoadElimination();
   bool isGEPFree(GetElementPtrInst &GEP);
   bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
@@ -275,10 +284,10 @@
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
-        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -333,6 +342,7 @@
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
+  disableLoadElimination();
 }
 
 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
@@ -350,6 +360,13 @@
   SROACostSavings += InstructionCost;
 }
 
+void CallAnalyzer::disableLoadElimination() {
+  if (EnableLoadElimination) {
+    Cost += LoadEliminationCost;
+    EnableLoadElimination = false;
+  }
+}
+
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
@@ -1076,6 +1093,15 @@
     disableSROA(CostIt);
   }
 
+  // If the data is already loaded from this address and hasn't been clobbered
+  // by any stores or calls, this load is likely to be redundant and can be
+  // eliminated.
+  if (EnableLoadElimination &&
+      !LoadAddrSet.insert(I.getPointerOperand()).second) {
+    LoadEliminationCost += InlineConstants::InstrCost;
+    return true;
+  }
+
   return false;
 }
 
@@ -1091,6 +1117,15 @@
     disableSROA(CostIt);
   }
 
+  // The store can potentially clobber loads and prevent repeated loads from
+  // being eliminated.
+  // FIXME:
+  // 1. We can probably keep an initial set of eliminable loads subtracted
+  // from the cost even when we finally see a store. We just need to disable
+  // *further* accumulation of elimination savings.
+  // 2. We should probably at some point thread MemorySSA for the callee into
+  // this and then use that to actually compute *really* precise savings.
+  disableLoadElimination();
   return false;
 }
 
@@ -1173,6 +1208,8 @@
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
     switch (II->getIntrinsicID()) {
     default:
+      if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
+        disableLoadElimination();
       return Base::visitCallSite(CS);
 
     case Intrinsic::load_relative:
@@ -1183,6 +1220,7 @@
     case Intrinsic::memset:
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
+      disableLoadElimination();
       // SROA can usually chew through these intrinsics, but they aren't free.
       return false;
     case Intrinsic::localescape:
@@ -1209,6 +1247,8 @@
       Cost += InlineConstants::CallPenalty;
   }
 
+  if (!CS.onlyReadsMemory())
+    disableLoadElimination();
   return Base::visitCallSite(CS);
 }
 
@@ -1223,8 +1263,11 @@
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
-  if (!F)
+  if (!F) {
+    if (!CS.onlyReadsMemory())
+      disableLoadElimination();
     return Base::visitCallSite(CS);
+  }
 
   // If we have a constant that we are calling as a function, we can peer
   // through it and see the function target. This happens not infrequently
@@ -1241,6 +1284,8 @@
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
   }
 
+  if (!F->onlyReadsMemory())
+    disableLoadElimination();
   return Base::visitCallSite(CS);
 }
 
@@ -1843,6 +1888,7 @@
   DEBUG_PRINT_STAT(NumInstructions);
   DEBUG_PRINT_STAT(SROACostSavings);
   DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(LoadEliminationCost);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
   DEBUG_PRINT_STAT(Threshold);
Index: llvm/trunk/lib/Analysis/ValueTracking.cpp
===================================================================
--- llvm/trunk/lib/Analysis/ValueTracking.cpp
+++ llvm/trunk/lib/Analysis/ValueTracking.cpp
@@ -483,7 +483,7 @@
 }
 
 // Is this an intrinsic that cannot be speculated but also cannot trap?
-static bool isAssumeLikeIntrinsic(const Instruction *I) {
+bool llvm::isAssumeLikeIntrinsic(const Instruction *I) {
   if (const CallInst *CI = dyn_cast<CallInst>(I))
     if (Function *F = CI->getCalledFunction())
       switch (F->getIntrinsicID()) {
Index: llvm/trunk/test/Transforms/Inline/redundant-loads.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/redundant-loads.ll
+++ llvm/trunk/test/Transforms/Inline/redundant-loads.ll
@@ -0,0 +1,186 @@
+; RUN: opt -inline < %s -S -o - -inline-threshold=3 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @pad() readnone
+
+define void @outer1(i32* %a) {
+; CHECK-LABEL: @outer1(
+; CHECK-NOT: call void @inner1
+  %b = alloca i32
+  call void @inner1(i32* %a, i32* %b)
+  ret void
+}
+
+define void @inner1(i32* %a, i32* %b) {
+  %1 = load i32, i32* %a
+  store i32 %1, i32* %b ; This store does not clobber the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  %3 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer2(i32* %a, i32* %b) {
+; CHECK-LABEL: @outer2(
+; CHECK: call void @inner2
+  call void @inner2(i32* %a, i32* %b)
+  ret void
+}
+
+define void @inner2(i32* %a, i32* %b) {
+  %1 = load i32, i32* %a
+  store i32 %1, i32* %b ; This store clobbers the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  ret void
+}
+
+
+define void @outer3(i32* %a) {
+; CHECK-LABEL: @outer3(
+; CHECK: call void @inner3
+  call void @inner3(i32* %a)
+  ret void
+}
+
+declare void @ext()
+
+define void @inner3(i32* %a) {
+  %1 = load i32, i32* %a
+  call void @ext() ; This call clobbers the first load.
+  %2 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer4(i32* %a, i32* %b, i32* %c) {
+; CHECK-LABEL: @outer4(
+; CHECK-NOT: call void @inner4
+  call void @inner4(i32* %a, i32* %b, i1 false)
+  ret void
+}
+
+define void @inner4(i32* %a, i32* %b, i1 %pred) {
+  %1 = load i32, i32* %a
+  br i1 %pred, label %cond_true, label %cond_false
+
+cond_true:
+  store i32 %1, i32* %b ; This store does not clobber the first load.
+  br label %cond_false
+
+cond_false:
+  %2 = load i32, i32* %a
+  call void @pad()
+  %3 = load i32, i32* %a
+  %4 = load i32, i32* %a
+  ret void
+}
+
+
+define void @outer5(i32* %a, double %b) {
+; CHECK-LABEL: @outer5(
+; CHECK-NOT: call void @inner5
+  call void @inner5(i32* %a, double %b)
+  ret void
+}
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+define void @inner5(i32* %a, double %b) {
+  %1 = load i32, i32* %a
+  %2 = call double @llvm.fabs.f64(double %b) ; This intrinsic does not clobber the first load.
+  %3 = load i32, i32* %a
+  call void @pad()
+  ret void
+}
+
+define void @outer6(i32* %a, i8* %ptr) {
+; CHECK-LABEL: @outer6(
+; CHECK-NOT: call void @inner6
+  call void @inner6(i32* %a, i8* %ptr)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) argmemonly nounwind
+
+define void @inner6(i32* %a, i8* %ptr) {
+  %1 = load i32, i32* %a
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %ptr) ; This intrinsic does not clobber the first load.
+  %2 = load i32, i32* %a
+  call void @pad()
+  %3 = load i32, i32* %a
+  ret void
+}
+
+define void @outer7(i32* %a) {
+; CHECK-LABEL: @outer7(
+; CHECK-NOT: call void @inner7
+  call void @inner7(i32* %a)
+  ret void
+}
+
+declare void @ext2() readnone
+
+define void @inner7(i32* %a) {
+  %1 = load i32, i32* %a
+  call void @ext2() ; This call does not clobber the first load.
+ %2 = load i32, i32* %a + ret void +} + + +define void @outer8(i32* %a) { +; CHECK-LABEL: @outer8( +; CHECK-NOT: call void @inner8 + call void @inner8(i32* %a, void ()* @ext2) + ret void +} + +define void @inner8(i32* %a, void ()* %f) { + %1 = load i32, i32* %a + call void %f() ; This indirect call does not clobber the first load. + %2 = load i32, i32* %a + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + ret void +} + + +define void @outer9(i32* %a) { +; CHECK-LABEL: @outer9( +; CHECK: call void @inner9 + call void @inner9(i32* %a, void ()* @ext) + ret void +} + +define void @inner9(i32* %a, void ()* %f) { + %1 = load i32, i32* %a + call void %f() ; This indirect call clobbers the first load. + %2 = load i32, i32* %a + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + call void @pad() + ret void +}
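
For illustration, here is a minimal sketch of the pattern the new heuristic credits. This is an assumed example, not part of the commit, and @sum_twice is a hypothetical function: both loads read the same address with no intervening may-write instruction, so the analyzer charges only the first one and accrues InlineConstants::InstrCost into LoadEliminationCost for the second. An assume-like intrinsic such as @llvm.assume does not forfeit the credit, which is exactly why isAssumeLikeIntrinsic() is exported from ValueTracking.

define i32 @sum_twice(i32* %p, i1 %c) {
  call void @llvm.assume(i1 %c) ; Assume-like: load elimination stays enabled.
  %a = load i32, i32* %p        ; First load from %p: charged normally.
  %b = load i32, i32* %p        ; Repeated address: credited to LoadEliminationCost.
  %s = add i32 %a, %b
  ret i32 %s
}

declare void @llvm.assume(i1)

The credit is forfeited wholesale (Cost += LoadEliminationCost in disableLoadElimination()) as soon as a potentially clobbering store or call is seen, as in @inner2 and @inner3 above.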