Index: llvm/lib/Analysis/InlineCost.cpp
===================================================================
--- llvm/lib/Analysis/InlineCost.cpp
+++ llvm/lib/Analysis/InlineCost.cpp
@@ -50,7 +50,7 @@
     cl::desc("Control the amount of inlining to perform (default = 225)"));
 
 static cl::opt<int> HintThreshold(
-    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
+    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
     cl::desc("Threshold for inlining functions with inline hint"));
 
 static cl::opt<int>
@@ -62,7 +62,7 @@
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
 static cl::opt<int> ColdThreshold(
-    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
+    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
     cl::desc("Threshold for inlining functions with cold attribute"));
 
 static cl::opt<int>
@@ -91,6 +91,12 @@
     cl::desc("Compute the full inline cost of a call site even when the cost "
              "exceeds the threshold."));
 
+static cl::opt<bool> OptPerformCallerAnalysis(
+    "inline-perform-caller-analysis", cl::Hidden, cl::init(false),
+    cl::ZeroOrMore,
+    cl::desc("Perform extra analysis in the callsite's basic block to aid "
+             "inlining decisions."));
+
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -180,6 +186,10 @@
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
 
+  /// Keep track of constants that are stored on the caller's stack, where a
+  /// pointer to the stack item is passed to the callee.
+  DenseMap<std::pair<Value *, uint64_t>, Constant *> CallerStackConstants;
+
   /// Keep track of dead blocks due to the constant arguments.
   SetVector<BasicBlock *> DeadBlocks;
@@ -194,6 +204,10 @@
   SmallPtrSet<Value *, 16> LoadAddrSet;
   int LoadEliminationCost = 0;
 
+  // State for the caller-side constant store analysis below.
+  bool PerformedCallerStoreSearch = false;
+  bool EnableCallerAnalysis;
+
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
@@ -207,6 +221,10 @@
   bool isGEPFree(GetElementPtrInst &GEP);
   bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
+  bool calleePtrToCallerAllocaOffset(Value *&V, uint64_t &Offset);
+  bool searchForConstantAtOffset(
+      Value *V, DenseMap<std::pair<Value *, uint64_t>, Constant *>::iterator
+                    &CallerStackConstantIt);
   bool simplifyCallSite(Function *F, CallBase &Call);
   template <typename Callable>
   bool simplifyInstruction(Instruction &I, Callable Evaluate);
@@ -300,7 +318,8 @@
         CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
         ComputeFullInlineCost(OptComputeFullInlineCost ||
                               Params.ComputeFullInlineCost || ORE),
-        EnableLoadElimination(true) {}
+        EnableLoadElimination(true),
+        EnableCallerAnalysis(OptPerformCallerAnalysis) {}
 
   InlineResult analyzeCall(CallBase &Call);
@@ -378,6 +397,12 @@
       addCost(LoadEliminationCost);
       LoadEliminationCost = 0;
       EnableLoadElimination = false;
+      // disableLoadElimination() is called whenever we might clobber future
+      // loads unexpectedly. This means we can't be sure what's on the caller's
+      // stack any more, so just delete all our cached information, and prevent
+      // anything further from being cached.
+      CallerStackConstants.clear();
+      EnableCallerAnalysis = false;
     }
   }
@@ -428,6 +453,72 @@
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
 }
 
+/// Given a pointer in the callee that points to the caller's stack, computes
+/// the pointer's total offset off of the corresponding alloca.
+///
+/// Asserts that V is a pointer to the caller's stack.
+bool CallAnalyzer::calleePtrToCallerAllocaOffset(Value *&V, uint64_t &Offset) {
+  assert(isAllocaDerivedArg(V));
+  std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs[V];
+  if (!BaseAndOffset.first)
+    return false;
+  V = BaseAndOffset.first;
+  Offset = BaseAndOffset.second.getZExtValue();
+  return true;
+}
+
+/// Looks up a constant on the caller's stack.
+///
+/// Value *V is a pointer in the callee that points to the caller's stack.
+/// This will search the stores of the callsite's basic block for any stored
+/// constants.
+bool CallAnalyzer::searchForConstantAtOffset(
+    Value *V, DenseMap<std::pair<Value *, uint64_t>, Constant *>::iterator
+                  &CallerStackConstantIt) {
+  if (!PerformedCallerStoreSearch && EnableCallerAnalysis) {
+    // Search the callsite's basic block.
+    // TODO: possibly stop once we hit our candidate store as an optimization,
+    // and later continue the search from the place we left off if we are
+    // analyzing another load.
+    for (auto IRI = ++CandidateCall.getReverseIterator();
+         IRI != CandidateCall.getParent()->rend(); IRI++) {
+      // If this is a store of a constant to a pointer on the caller's stack
+      // with a constant offset, save the value.
+      if (StoreInst *SI = dyn_cast<StoreInst>(&*IRI)) {
+        Constant *StoreVal = dyn_cast<Constant>(SI->getValueOperand());
+        if (!StoreVal)
+          continue;
+
+        Value *PtrVal = SI->getPointerOperand();
+        ConstantInt *Offset = stripAndComputeInBoundsConstantOffsets(PtrVal);
+        if (Offset && isa<AllocaInst>(PtrVal)) {
+          auto PtrAndOffset = std::make_pair(PtrVal, Offset->getZExtValue());
+          // try_emplace won't insert if the key already exists, so older
+          // stores won't clobber the newer ones here.
+          CallerStackConstants.try_emplace(PtrAndOffset, StoreVal);
+        }
+      } else if (CallBase *Call = dyn_cast<CallBase>(&*IRI)) {
+        // Don't continue past some calls, as they may modify our stack data.
+        // FIXME: would be nice to perform a more precise analysis here.
+        IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call);
+        if (!Call->onlyReadsMemory() && (!II || !isAssumeLikeIntrinsic(II)))
+          break;
+      }
+    }
+    PerformedCallerStoreSearch = true;
+  }
+
+  uint64_t Offset;
+  if (!calleePtrToCallerAllocaOffset(V, Offset))
+    return false;
+  // V is now the caller's alloca.
+  CallerStackConstantIt = CallerStackConstants.find(std::make_pair(V, Offset));
+  // If the Constant pointer in the map is nullptr, we may have known the
+  // value previously, but a store in the callee has overwritten it with
+  // a non-constant.
+  return CallerStackConstantIt != CallerStackConstants.end() && CallerStackConstantIt->second;
+}
+
 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   // Check whether inlining will turn a dynamic alloca into a static
   // alloca and handle that case.
@@ -1124,6 +1215,20 @@
   Value *SROAArg;
   DenseMap<Value *, int>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+    // Check here whether we can simplify this load instruction based on stores
+    // from the callsite's basic block.
+    Value *V = I.getPointerOperand();
+    DenseMap<std::pair<Value *, uint64_t>, Constant *>::iterator
+        CallerStackConstantIt;
+    if (EnableCallerAnalysis &&
+        searchForConstantAtOffset(V, CallerStackConstantIt)) {
+      // If the stored constant has the same type as the load, record it in
+      // SimplifiedValues.
+      if (I.getType() == CallerStackConstantIt->second->getType())
+        SimplifiedValues[&I] = CallerStackConstantIt->second;
+      return true;
+    }
+
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
       return true;
@@ -1148,6 +1253,21 @@
   Value *SROAArg;
   DenseMap<Value *, int>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+    // Must invalidate constants stored here to the arguments.
+    Value *V = I.getPointerOperand();
+    uint64_t Offset;
+    if (EnableCallerAnalysis && calleePtrToCallerAllocaOffset(V, Offset)) {
+      // V is now the caller's alloca.
+      auto PtrAndOffset = std::make_pair(V, Offset);
+      Value *StoreVal = I.getValueOperand();
+      // Try to convert the store's operand to a constant.
+      Constant *StoreC = dyn_cast<Constant>(StoreVal);
+      if (!StoreC)
+        StoreC = SimplifiedValues.lookup(StoreVal);
+      // Store either the final constant or a nullptr.
+      CallerStackConstants[PtrAndOffset] = StoreC;
+    }
+
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
       return true;
Index: llvm/test/Transforms/Inline/caller-stack-constants.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/Inline/caller-stack-constants.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -Oz -inline-perform-caller-analysis %s | FileCheck %s
+; Tests that the inliner can propagate some constants stored on the caller's
+; stack to values used in the callee.
+; Essentially tests that a C++ unique_ptr will have its destructor elided if
+; it was just moved from, even in -Oz.
+
+%class.unique_ptr = type { %struct.Dummy* }
+%struct.Dummy = type { i32 }
+
+@global_unique_ptr = global %class.unique_ptr zeroinitializer, align 8
+
+; Similar to an operator=(unique_ptr&& other) or a move constructor.
+define void @transfer_ptr_ownership(%class.unique_ptr* %this, %class.unique_ptr* dereferenceable(8) %other) {
+entry:
+  %ptr_ = getelementptr inbounds %class.unique_ptr, %class.unique_ptr* %other, i64 0, i32 0
+  %0 = bitcast %class.unique_ptr* %other to i64*
+  %1 = load i64, i64* %0, align 8
+  %2 = bitcast %class.unique_ptr* %this to i64*
+  store i64 %1, i64* %2, align 8
+  store %struct.Dummy* null, %struct.Dummy** %ptr_, align 8
+  ret void
+}
+
+; Basically a unique_ptr destructor.
+define void @destruct_if_not_null(%class.unique_ptr* %this) {
+entry:
+  %ptr_ = getelementptr inbounds %class.unique_ptr, %class.unique_ptr* %this, i64 0, i32 0
+  %0 = load %struct.Dummy*, %struct.Dummy** %ptr_, align 8
+  %tobool = icmp eq %struct.Dummy* %0, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @external_destructor(%struct.Dummy* nonnull %0)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+define i32 @main() {
+; CHECK-LABEL: @main(
+entry:
+  %ref.tmp = alloca %class.unique_ptr, align 8
+; CHECK: call void @return_a_unique_ptr
+  call void @return_a_unique_ptr(%class.unique_ptr* nonnull sret %ref.tmp)
+; CHECK-NOT: call void @transfer_ptr_ownership
+  call void @transfer_ptr_ownership(%class.unique_ptr* nonnull @global_unique_ptr, %class.unique_ptr* nonnull dereferenceable(8) %ref.tmp)
+; CHECK-NOT: call void @destruct_if_not_null
+  call void @destruct_if_not_null(%class.unique_ptr* nonnull %ref.tmp)
+; CHECK: ret i32 0
+  ret i32 0
+}
+
+declare void @return_a_unique_ptr(%class.unique_ptr* sret) local_unnamed_addr
+declare void @external_destructor(%struct.Dummy*) local_unnamed_addr
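
For context, here is a hedged C++ sketch of the source pattern the new test approximates (the test's own comment frames it as a moved-from unique_ptr whose destructor should be elided at -Oz). The `Dummy` type and function names mirror the IR test; the C++ itself is illustrative and not part of the patch.

```cpp
// Illustrative only (not part of the patch): roughly the C++ that lowers to
// caller-stack-constants.ll. After std::move, tmp's internal pointer is a
// constant null stored on main's stack, so once ~unique_ptr() is inlined the
// null check should fold away under the new caller-side analysis.
#include <memory>

struct Dummy { int x; };

std::unique_ptr<Dummy> global_unique_ptr;      // like @global_unique_ptr
std::unique_ptr<Dummy> return_a_unique_ptr();  // like @return_a_unique_ptr

int main() {
  std::unique_ptr<Dummy> tmp = return_a_unique_ptr();  // %ref.tmp alloca
  global_unique_ptr = std::move(tmp);  // like @transfer_ptr_ownership
  return 0;
  // tmp's destructor (like @destruct_if_not_null) runs on a pointer the
  // caller-side store analysis can now prove is null, so it simplifies away.
}
```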