Index: llvm/include/llvm/Transforms/Utils/CodeExtractor.h =================================================================== --- llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -27,6 +27,7 @@ class BlockFrequency; class BlockFrequencyInfo; class BranchProbabilityInfo; +class CallInst; class DominatorTree; class Function; class Instruction; @@ -164,10 +165,9 @@ DenseMap &ExitWeights, BranchProbabilityInfo *BPI); - void emitCallAndSwitchStatement(Function *newFunction, - BasicBlock *newHeader, - ValueSet &inputs, - ValueSet &outputs); + CallInst *emitCallAndSwitchStatement(Function *newFunction, + BasicBlock *newHeader, + ValueSet &inputs, ValueSet &outputs); }; } // end namespace llvm Index: llvm/lib/Transforms/Utils/CodeExtractor.cpp =================================================================== --- llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -883,9 +883,10 @@ /// emitCallAndSwitchStatement - This method sets up the caller side by adding /// the call instruction, splitting any PHI nodes in the header block as /// necessary. -void CodeExtractor:: -emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, - ValueSet &inputs, ValueSet &outputs) { +CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, + BasicBlock *codeReplacer, + ValueSet &inputs, + ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plan inputs and allocated memory for outputs std::vector params, StructValues, ReloadOutputs, Reloads; @@ -893,6 +894,7 @@ Module *M = newFunction->getParent(); LLVMContext &Context = M->getContext(); const DataLayout &DL = M->getDataLayout(); + CallInst *call = nullptr; // Add inputs as params, or to be filled into the struct for (Value *input : inputs) @@ -943,8 +945,8 @@ } // Emit the call to the function - CallInst *call = CallInst::Create(newFunction, params, - NumExitBlocks > 1 ? "targetBlock" : ""); + call = CallInst::Create(newFunction, params, + NumExitBlocks > 1 ? "targetBlock" : ""); // Add debug location to the new call, if the original function has debug // info. In that case, the terminator of the entry block of the extracted // function contains the first debug location of the extracted function, @@ -1116,6 +1118,8 @@ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); break; } + + return call; } void CodeExtractor::moveCodeToFunction(Function *newFunction) { @@ -1177,6 +1181,71 @@ MDBuilder(TI->getContext()).createBranchWeights(BranchWeights)); } +/// Scan the extraction region for lifetime markers which reference inputs. +/// Erase these markers. Return the inputs which were referenced. +/// +/// The extraction region is defined by a set of blocks (\p Blocks), and a set +/// of allocas which will be moved from the caller function into the extracted +/// function (\p SunkAllocas). +static SetVector +eraseLifetimeMarkersOnInputs(const SetVector &Blocks, + const SetVector &SunkAllocas) { + SetVector InputObjectsWithLifetime; + for (BasicBlock *BB : Blocks) { + for (auto It = BB->begin(), End = BB->end(); It != End;) { + auto *II = dyn_cast(&*It); + ++It; + if (!II || !II->isLifetimeStartOrEnd()) + continue; + + // Get the memory operand of the lifetime marker. If the underlying + // object is a sunk alloca, or is otherwise defined in the extraction + // region, the lifetime marker must not be erased. + Value *Mem = II->getOperand(1)->stripInBoundsOffsets(); + if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem)) + continue; + + InputObjectsWithLifetime.insert(Mem); + II->eraseFromParent(); + } + } + return InputObjectsWithLifetime; +} + +/// Insert lifetime start/end markers surrounding the call to the new function +/// for objects defined in the caller. +static void insertLifetimeMarkersSurroundingCall( + Module *M, const SetVector &InputObjectsWithLifetime, + CallInst *TheCall) { + if (InputObjectsWithLifetime.empty()) + return; + + LLVMContext &Ctx = M->getContext(); + auto Int8PtrTy = Type::getInt8PtrTy(Ctx); + auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1); + auto LifetimeStartFn = llvm::Intrinsic::getDeclaration( + M, llvm::Intrinsic::lifetime_start, Int8PtrTy); + auto LifetimeEndFn = llvm::Intrinsic::getDeclaration( + M, llvm::Intrinsic::lifetime_end, Int8PtrTy); + for (Value *Mem : InputObjectsWithLifetime) { + assert((!isa(Mem) || + cast(Mem)->getFunction() == TheCall->getFunction()) && + "Input memory not defined in original function"); + Value *MemAsI8Ptr = nullptr; + if (Mem->getType() == Int8PtrTy) + MemAsI8Ptr = Mem; + else + MemAsI8Ptr = + CastInst::CreatePointerCast(Mem, Int8PtrTy, "lt.cast", TheCall); + + auto StartMarker = + CallInst::Create(LifetimeStartFn, {NegativeOne, MemAsI8Ptr}); + StartMarker->insertBefore(TheCall); + auto EndMarker = CallInst::Create(LifetimeEndFn, {NegativeOne, MemAsI8Ptr}); + EndMarker->insertAfter(TheCall); + } +} + Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) return nullptr; @@ -1291,11 +1360,17 @@ cast(II)->moveBefore(TI); } + // Collect objects which are inputs to the extraction region and also + // referenced by lifetime start/end markers within it. The effects of these + // markers must be replicated in the calling function to prevent the stack + // coloring pass from merging slots which store input objects. + ValueSet InputObjectsWithLifetime = + eraseLifetimeMarkersOnInputs(Blocks, SinkingCands); + // Construct new function based on inputs/outputs & add allocas for all defs. - Function *newFunction = constructFunction(inputs, outputs, header, - newFuncRoot, - codeReplacer, oldFunction, - oldFunction->getParent()); + Function *newFunction = + constructFunction(inputs, outputs, header, newFuncRoot, codeReplacer, + oldFunction, oldFunction->getParent()); // Update the entry count of the function. if (BFI) { @@ -1306,10 +1381,16 @@ BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency()); } - emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs); + CallInst *TheCall = + emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs); moveCodeToFunction(newFunction); + // Replicate the effects of any lifetime start/end markers which referenced + // input objects in the extraction region by placing markers around the call. + insertLifetimeMarkersSurroundingCall(oldFunction->getParent(), + InputObjectsWithLifetime, TheCall); + // Propagate personality info to the new function if there is one. if (oldFunction->hasPersonalityFn()) newFunction->setPersonalityFn(oldFunction->getPersonalityFn()); Index: llvm/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll =================================================================== --- llvm/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll +++ llvm/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll @@ -6,10 +6,14 @@ @g = external local_unnamed_addr global i32, align 4 +; CHECK-LABEL: define{{.*}}@caller( +; CHECK: call void @llvm.lifetime.start.p0i8(i64 -1, i8* %tmp.i) +; CHECK-NEXT: call void @callee_unknown_use1.{{.*}}(i8* %tmp.i +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* %tmp.i) + define i32 @callee_unknown_use1(i32 %arg) local_unnamed_addr #0 { ; CHECK-LABEL:define{{.*}}@callee_unknown_use1.{{[0-9]}} ; CHECK-NOT: alloca -; CHECK: call void @llvm.lifetime bb: %tmp = alloca i8, align 4 %tmp2 = load i32, i32* @g, align 4, !tbaa !2 Index: llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll =================================================================== --- llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll +++ llvm/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll @@ -9,7 +9,6 @@ define i32 @callee_unknown_use2(i32 %arg) local_unnamed_addr #0 { ; CHECK-LABEL:define{{.*}}@callee_unknown_use2.{{[0-9]}} ; CHECK-NOT: alloca -; CHECK: call void @llvm.lifetime bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* Index: llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/HotColdSplit/lifetime-markers-on-inputs.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -hotcoldsplit < %s 2>&1 | FileCheck %s + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + +declare void @use(i8*) + +declare void @cold_use2(i8*, i8*) cold + +; CHECK-LABEL: define {{.*}}@foo( +define void @foo() { +entry: + %local1 = alloca i256 + %local2 = alloca i256 + %local1_cast = bitcast i256* %local1 to i8* + %local2_cast = bitcast i256* %local2 to i8* + br i1 undef, label %normalPath, label %outlinedPath + +normalPath: + ; These two uses of stack slots are non-overlapping. Based on this alone, + ; the stack slots could be merged. + call void @llvm.lifetime.start.p0i8(i64 1, i8* %local1_cast) + call void @use(i8* %local1_cast) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %local1_cast) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %local2_cast) + call void @use(i8* %local2_cast) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %local2_cast) + ret void + +; CHECK-LABEL: codeRepl: +; CHECK: [[local1_cast:%.*]] = bitcast i256* %local1 to i8* +; CHECK: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[local1_cast]]) +; CHECK: [[local2_cast:%.*]] = bitcast i256* %local2 to i8* +; CHECK: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[local2_cast]]) +; CHECK: call i1 @foo.cold.1(i8* %local1_cast, i8* %local2_cast) +; CHECK: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[local2_cast]]) +; CHECK: call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[local1_cast]]) +; CHECK: br i1 + +outlinedPath: + ; These two uses of stack slots are overlapping. This should prevent + ; merging of stack slots. CodeExtractor must replicate the effects of + ; these markers in the caller to inhibit stack coloring. + %gep1 = getelementptr inbounds i8, i8* %local1_cast, i64 1 + call void @llvm.lifetime.start.p0i8(i64 1, i8* %gep1) + call void @llvm.lifetime.start.p0i8(i64 1, i8* %local2_cast) + call void @cold_use2(i8* %local1_cast, i8* %local2_cast) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %gep1) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %local2_cast) + br i1 undef, label %outlinedPath2, label %outlinedPathExit + +outlinedPath2: + ; These extra lifetime markers are used to test that we emit only one + ; pair of guard markers in the caller per memory object. + call void @llvm.lifetime.start.p0i8(i64 1, i8* %local2_cast) + call void @use(i8* %local2_cast) + call void @llvm.lifetime.end.p0i8(i64 1, i8* %local2_cast) + ret void + +outlinedPathExit: + ret void +} + +; CHECK-LABEL: define {{.*}}@foo.cold.1( +; CHECK-NOT: @llvm.lifetime