Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -137,6 +137,13 @@ /// Keep track of values which map to a pointer base and constant offset. DenseMap> ConstantOffsetPtrs; + /// Model the elimination of repeated loads that is expected to happen + /// whenever we simplify away the stores that would otherwise cause them to be + /// loads. + bool EnableLoadElimination; + SmallPtrSet LoadAddrSet; + int LoadEliminationCost; + // Custom simplification helper routines. bool isAllocaDerivedArg(Value *V); bool lookupSROAArgAndCost(Value *V, Value *&Arg, @@ -145,6 +152,7 @@ void disableSROA(Value *V); void accumulateSROACost(DenseMap::iterator CostIt, int InstructionCost); + void disableLoadElimination(); bool isGEPFree(GetElementPtrInst &GEP); bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); bool simplifyCallSite(Function *F, CallSite CS); @@ -227,10 +235,11 @@ ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), FiftyPercentVectorBonus(0), - TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), - NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), - NumConstantPtrDiffs(0), NumInstructionsSimplified(0), - SROACostSavings(0), SROACostSavingsLost(0) {} + TenPercentVectorBonus(0), VectorBonus(0), EnableLoadElimination(true), + LoadEliminationCost(0), NumConstantArgs(0), NumConstantOffsetPtrArgs(0), + NumAllocaArgs(0), NumConstantPtrCmps(0), NumConstantPtrDiffs(0), + NumInstructionsSimplified(0), SROACostSavings(0), + SROACostSavingsLost(0), LoadEliminationCostSavings(0) {} bool analyzeCall(CallSite CS); @@ -247,6 +256,7 @@ unsigned NumInstructionsSimplified; unsigned SROACostSavings; unsigned SROACostSavingsLost; + unsigned LoadEliminationCostSavings; void dump(); }; @@ -285,6 +295,7 @@ SROACostSavings -= CostIt->second; SROACostSavingsLost += CostIt->second; SROAArgCosts.erase(CostIt); + disableLoadElimination(); } /// \brief If 'V' maps to a SROA candidate, disable SROA for it. @@ -302,6 +313,12 @@ SROACostSavings += InstructionCost; } +void CallAnalyzer::disableLoadElimination() { + Cost += LoadEliminationCost; + LoadEliminationCostSavings = 0; + EnableLoadElimination = false; +} + /// \brief Accumulate a constant GEP offset into an APInt if possible. /// /// Returns false if unable to compute the offset for any reason. Respects any @@ -816,6 +833,17 @@ disableSROA(CostIt); } + // If the data are already loaded from this address and there is no store and + // call in the function, this load is likely to be redundant and can be + // eliminated. + if (EnableLoadElimination) { + if (!LoadAddrSet.insert(I.getPointerOperand()).second) { + LoadEliminationCost += InlineConstants::InstrCost; + LoadEliminationCostSavings += InlineConstants::InstrCost; + return true; + } + } + return false; } @@ -831,6 +859,16 @@ disableSROA(CostIt); } + // The store can potentially clobber the repeated loads and prevent them from + // elimination. + // FIXME: + // 1. We can probably keep an initial set of eliminatable loads substracted + // from the cost even when we finally see a store. We just need to disable + // *further* accumulation of elimination savings. + // 2. We should probably at some point thread MemorySSA for the callee into + // this and then use that to actually compute *really* precise savings. + if (EnableLoadElimination) + disableLoadElimination(); return false; } @@ -894,6 +932,8 @@ } bool CallAnalyzer::visitCallSite(CallSite CS) { + if (EnableLoadElimination) + disableLoadElimination(); if (CS.hasFnAttr(Attribute::ReturnsTwice) && !F.hasFnAttribute(Attribute::ReturnsTwice)) { // This aborts the entire analysis. @@ -1453,6 +1493,7 @@ DEBUG_PRINT_STAT(NumInstructions); DEBUG_PRINT_STAT(SROACostSavings); DEBUG_PRINT_STAT(SROACostSavingsLost); + DEBUG_PRINT_STAT(LoadEliminationCostSavings); DEBUG_PRINT_STAT(ContainsNoDuplicateCall); DEBUG_PRINT_STAT(Cost); DEBUG_PRINT_STAT(Threshold); Index: test/Transforms/Inline/redundant-loads.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/redundant-loads.ll @@ -0,0 +1,98 @@ +; RUN: opt -inline < %s -S -o - -inline-threshold=0 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @outer1(i32* %a) { +; CHECK-LABEL: @outer1( +; CHECK-NOT: call void @inner1 + %b = alloca i32 + call void @inner1(i32* %a, i32* %b) + ret void +} + +define void @inner1(i32* %a, i32* %b) { + %1 = load i32, i32* %a + store i32 %1, i32 * %b + %2 = load i32, i32* %a + store i32 %2, i32 * %b + %3 = load i32, i32* %a + store i32 %3, i32 * %b + %4 = load i32, i32* %a + store i32 %4, i32 * %b + %5 = load i32, i32* %a + store i32 %5, i32 * %b + %6 = load i32, i32* %a + store i32 %6, i32 * %b + %7 = load i32, i32* %a + store i32 %7, i32 * %b + %8 = load i32, i32* %a + store i32 %8, i32 * %b + %9 = load i32, i32* %a + ret void +} + +define void @outer2(i32* %a, i32* %b) { +; CHECK-LABEL: @outer2( +; CHECK: call void @inner2 + call void @inner2(i32* %a, i32* %b) + ret void +} + +define void @inner2(i32* %a, i32* %b) { + %1 = load i32, i32* %a + store i32 %1, i32 * %b + %2 = load i32, i32* %a + store i32 %2, i32 * %b + %3 = load i32, i32* %a + store i32 %3, i32 * %b + %4 = load i32, i32* %a + store i32 %4, i32 * %b + %5 = load i32, i32* %a + ret void +} + +define void @outer3(i32* %a) { +; CHECK-LABEL: @outer3( +; CHECK: call void @inner3 + call void @inner3(i32* %a) + ret void +} + +declare void @foo() + +define void @inner3(i32* %a) { + %1 = load i32, i32* %a + call void @foo() + %2 = load i32, i32* %a + ret void +} + +define void @outer4(i32* %a, i32* %b, i32* %c) { +; CHECK-LABEL: @outer4( +; CHECK-NOT: call void @inner4 + call void @inner4(i32* %a, i32* %b, i32* %c, i1 false) + ret void +} + +define void @inner4(i32* %a, i32* %b, i32* %c, i1 %pred) { + %1 = load i32, i32* %a + %2 = load i32, i32* %b + %3 = load i32, i32* %c + br i1 %pred, label %cond_true, label %cond_false + +cond_true: + store i32 %1, i32 * %b + br label %cond_false + +cond_false: + %4 = load i32, i32* %a + %5 = load i32, i32* %b + %6 = load i32, i32* %c + %7 = add i32 %1, %2 + %8 = add i32 %3, %7 + %9 = add i32 %4, %8 + %10 = add i32 %5, %9 + %11 = add i32 %6, %10 + ret void +}