Index: llvm/include/llvm/Transforms/Scalar/SROA.h =================================================================== --- llvm/include/llvm/Transforms/Scalar/SROA.h +++ llvm/include/llvm/Transforms/Scalar/SROA.h @@ -123,6 +123,7 @@ PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT, AssumptionCache &RunAC); + bool isolateNonCapturingCalls(Function &F); bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS, sroa::Partition &P); Index: llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h =================================================================== --- llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -27,7 +27,8 @@ /// (transitively) using this alloca. This also enforces that there is only /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. -bool isAllocaPromotable(const AllocaInst *AI); +bool isAllocaPromotable(const AllocaInst *AI, + bool AllowNonCapturingCalls = false); /// Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. 
Index: llvm/lib/Transforms/Scalar/SROA.cpp =================================================================== --- llvm/lib/Transforms/Scalar/SROA.cpp +++ llvm/lib/Transforms/Scalar/SROA.cpp @@ -4702,6 +4702,82 @@ return true; } +bool SROA::isolateNonCapturingCalls(Function &F) { + BasicBlock &EntryBB = F.getEntryBlock(); + bool Changed = false; + + SmallVector<User *, 8> WorkList; + SmallVector<Value *, 8> GepList; + SmallVector<CallInst *, 8> CallsToConvert; + + for (Instruction &I : EntryBB) { + AllocaInst *AI = dyn_cast<AllocaInst>(&I); + if (!AI || !AI->getAllocatedType()->isSingleValueType()) + continue; + + WorkList.clear(); + WorkList.append(AI->user_begin(), AI->user_end()); + GepList.clear(); + CallsToConvert.clear(); + + while (!WorkList.empty()) { + Instruction *I = dyn_cast<Instruction>(WorkList.pop_back_val()); + if (!I) + continue; + + // Look through GEPs to see if the base address of the alloca is + // eventually used in a call. + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GEP->hasAllZeroIndices()) { + GepList.push_back(GEP); + WorkList.append(GEP->user_begin(), GEP->user_end()); + continue; + } + + // We're only interested if the alloca is used by a non-intrinsic + // call instruction without operand bundles... + if (auto *CI = dyn_cast<CallInst>(I)) + if (!isa<IntrinsicInst>(I) && !CI->hasOperandBundles()) + CallsToConvert.push_back(CI); + } + + if (CallsToConvert.empty()) + continue; + + // ...and if the alloca would otherwise be promotable. + if (!isAllocaPromotable(AI, /*AllowNonCapturingCalls=*/true)) + continue; + + Changed = true; + LLVM_DEBUG(dbgs() << "SROA: Isolating calls using alloca: " << *AI << "\n"); + + // Create a new alloca, then replace users around the call(s). 
+ IRBuilderTy Builder(AI); + Type *PointeeTy = AI->getType()->getPointerElementType(); + AllocaInst *NewAI = Builder.CreateAlloca(PointeeTy, nullptr, + AI->getName() + ".sroa.isolate"); + LLVM_DEBUG(dbgs() << "\tCreating new alloca: " << *NewAI << "\n"); + + for (CallInst *CI : CallsToConvert) { + LLVM_DEBUG(dbgs() << "\tIsolating call: " << *CI << "\n"); + Builder.SetInsertPoint(CI); + LoadInst *Load = Builder.CreateLoad(PointeeTy, AI); + Builder.CreateStore(Load, NewAI); + + Builder.SetInsertPoint(CI->getNextNonDebugInstruction()); + Load = Builder.CreateLoad(PointeeTy, NewAI); + Builder.CreateStore(Load, AI); + + for (unsigned i = 0; i < CI->arg_size(); ++i) + if (CI->getArgOperand(i) == AI || + is_contained(GepList, CI->getArgOperand(i))) + CI->setArgOperand(i, NewAI); + } + } + + return Changed; +} + PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, AssumptionCache &RunAC) { LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); @@ -4709,6 +4785,12 @@ DT = &RunDT; AC = &RunAC; + // First look for allocas which are used in a call but not captured, and + // add a new alloca to cover uses before the first capture and store the + // value to the old alloca; this may enable additional optimizations for + // the uncaptured alloca. + bool Changed = isolateNonCapturingCalls(F); + BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); I != E; ++I) { @@ -4722,7 +4804,6 @@ } } - bool Changed = false; // A set of deleted alloca instruction pointers which should be removed from // the list of promotable allocas. 
SmallPtrSet<AllocaInst *, 4> DeletedAllocas; Index: llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp =================================================================== --- llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -61,7 +61,22 @@ STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); -bool llvm::isAllocaPromotable(const AllocaInst *AI) { +static bool isCallNonCapturing(const CallInst *CI, const Instruction *I) { + // Reject calls with operand bundles for now. + if (CI->hasOperandBundles()) + return false; + + // Only allow a call if all uses of the alloca do not capture the pointer. + bool NoCaptures = true; + for (unsigned i = 0; i < CI->arg_size(); ++i) + if (CI->getArgOperand(i) == I) + NoCaptures &= CI->paramHasAttr(i, Attribute::NoCapture); + + return NoCaptures; +} + +bool llvm::isAllocaPromotable(const AllocaInst *AI, + bool AllowNonCapturingCalls) { // Only allow direct and non-volatile loads and stores... 
for (const User *U : AI->users()) { if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { @@ -86,11 +101,24 @@ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { if (!GEPI->hasAllZeroIndices()) return false; - if (!onlyUsedByLifetimeMarkersOrDroppableInsts(GEPI)) + for (const User *U : GEPI->users()) { + const CallInst *CI = dyn_cast<CallInst>(U); + if (CI && !isa<IntrinsicInst>(CI) && AllowNonCapturingCalls && + isCallNonCapturing(CI, GEPI)) + continue; + + const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); + if (II && (II->isLifetimeStartOrEnd() || II->isDroppable())) + continue; + return false; + } } else if (const AddrSpaceCastInst *ASCI = dyn_cast<AddrSpaceCastInst>(U)) { if (!onlyUsedByLifetimeMarkers(ASCI)) return false; + } else if (const CallInst *CI = dyn_cast<CallInst>(U)) { + if (!AllowNonCapturingCalls || !isCallNonCapturing(CI, AI)) + return false; } else { return false; } Index: llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll +++ llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll @@ -9,7 +9,7 @@ declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) -declare void @foo(i8* nocapture) +declare void @foo(i8*) define void @asan() sanitize_address { entry: Index: llvm/test/Transforms/SROA/non-capturing-call.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SROA/non-capturing-call.ll @@ -0,0 +1,127 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -passes=sroa -S | FileCheck %s + +; CHECK-LABEL: @alloca_used_in_call +define i32 @alloca_used_in_call(i32* nocapture nonnull readonly %data, i64 %n) { +; CHECK-NOT: %retval = alloca +; CHECK: %retval.sroa.isolate = alloca i32, align 4 +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + br label %loop + +; CHECK-LABEL: loop: +; CHECK: %retval.0 = phi i32 [ 0, %entry ], [ %rdx.inc, %loop 
] +loop: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %ld = load i32, i32* %arrayidx, align 4 + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, %ld + store i32 %rdx.inc, i32* %retval, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %n + br i1 %exitcond, label %loop, label %exit + +; CHECK-LABEL: exit: +; CHECK: store i32 %rdx.inc, i32* %retval.sroa.isolate, align 4 +; CHECK: %0 = call i32 @user_of_alloca(i32* nocapture nonnull %retval.sroa.isolate) +; CHECK: %1 = load i32, i32* %retval.sroa.isolate, align 4 +; CHECK: ret i32 %1 +exit: + %0 = call i32 @user_of_alloca(i32* nocapture nonnull %retval) + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; CHECK-LABEL: @alloca_captured_in_call +define i32 @alloca_captured_in_call(i32* nocapture nonnull readonly %data, i64 %n) { +; CHECK-NOT: %retval.sroa.isolate = alloca i32, align 4 +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + br label %loop + +; CHECK-LABEL: loop: +; CHECK-NOT: %retval.0 = phi i32 [ 0, %entry ], [ %rdx.inc, %loop ] +loop: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %ld = load i32, i32* %arrayidx, align 4 + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, %ld + store i32 %rdx.inc, i32* %retval, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %n + br i1 %exitcond, label %loop, label %exit + +exit: + %0 = call i32 @capture_of_alloca(i32* nonnull %retval) + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; CHECK-LABEL: @alloca_with_gep_used_in_call +define i32 @alloca_with_gep_used_in_call(i32* nocapture nonnull readonly %data, i64 %n) { +; CHECK-NOT: %retval = alloca +; CHECK: %retval.sroa.isolate = alloca i32, align 4 +entry: + 
%retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + br label %loop + +; CHECK-LABEL: loop: +; CHECK: %retval.0 = phi i32 [ 0, %entry ], [ %rdx.inc, %loop ] +loop: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %ld = load i32, i32* %arrayidx, align 4 + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, %ld + store i32 %rdx.inc, i32* %retval, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %n + br i1 %exitcond, label %loop, label %exit + +; CHECK-LABEL: exit: +; CHECK: store i32 %rdx.inc, i32* %retval.sroa.isolate, align 4 +; CHECK: %0 = call i32 @user_of_alloca(i32* nocapture nonnull %retval.sroa.isolate) +; CHECK: %1 = load i32, i32* %retval.sroa.isolate, align 4 +; CHECK: ret i32 %1 +exit: + %gep = getelementptr i32, i32* %retval, i32 0 + %0 = call i32 @user_of_alloca(i32* nocapture nonnull %gep) + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; CHECK-LABEL: @alloca_captured_second_arg +define i32 @alloca_captured_second_arg(i32* nocapture nonnull readonly %data, i64 %n) { +; CHECK-NOT: %retval.sroa.isolate = alloca i32, align 4 +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + br label %loop + +; CHECK-LABEL: loop: +; CHECK-NOT: %retval.0 = phi i32 [ 0, %entry ], [ %rdx.inc, %loop ] +loop: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %ld = load i32, i32* %arrayidx, align 4 + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, %ld + store i32 %rdx.inc, i32* %retval, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %n + br i1 %exitcond, label %loop, label %exit + +exit: + %0 = call i32 @capture_with_multiple_args(i32* nocapture nonnull %retval, i32* nonnull %retval) + %1 = load i32, i32* 
%retval, align 4 + ret i32 %1 +} + +declare dso_local i32 @user_of_alloca(i32* nocapture nonnull) +declare dso_local i32 @capture_of_alloca(i32 *nonnull) +declare dso_local i32 @capture_with_multiple_args(i32* nocapture nonnull, i32* nonnull) \ No newline at end of file