Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3686,13 +3686,13 @@
 /// beginning of DestBlock, which can only happen if it's safe to move the
 /// instruction past all of the instructions between it and the end of its
 /// block.
-static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock,
+                                 TargetLibraryInfo &TLI) {
   assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!");
   BasicBlock *SrcBlock = I->getParent();
 
-  // Cannot move control-flow-involving, volatile loads, vaarg, etc.
-  if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
-      I->isTerminator())
+  // Cannot move control-flow-involving instructions.
+  if (isa<PHINode>(I) || I->isEHPad() || I->isTerminator())
     return false;
 
   // Do not sink static or dynamic alloca instructions. Static allocas must
@@ -3711,6 +3711,7 @@
     if (CI->isConvergent())
       return false;
   }
+
   // We can only sink load instructions if there is nothing between the load and
   // the end of block that could change the value.
   if (I->mayReadFromMemory()) {
@@ -3726,6 +3727,20 @@
       return false;
   }
 
+  if (I->mayHaveSideEffects()) {
+    if (!wouldInstructionBeTriviallyDeadOnUnusedPaths(I, &TLI))
+      return false;
+    // Avoid having to reason about intervening blocks.
+    if (DestBlock->getUniquePredecessor() != I->getParent())
+      return false;
+    // We just handle the simplest case, where the instruction following I is
+    // the terminator of its block.
+    // TODO: Handle this more generally by identifying which kinds of
+    // instructions a side-effecting instruction can safely be moved past.
+    if (&*(std::next(I->getIterator())) != I->getParent()->getTerminator())
+      return false;
+  }
+
   I->dropDroppableUses([DestBlock](const Use *U) {
     if (auto *I = dyn_cast<Instruction>(U->getUser()))
       return I->getParent() != DestBlock;
@@ -3895,7 +3910,7 @@
     if (OptBB) {
       auto *UserParent = *OptBB;
       // Okay, the CFG is simple enough, try to sink this instruction.
-      if (TryToSinkInstruction(I, UserParent)) {
+      if (TryToSinkInstruction(I, UserParent, TLI)) {
         LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
         MadeIRChange = true;
         // We'll add uses of the sunk instruction below, but since
Index: llvm/lib/Transforms/Utils/Local.cpp
===================================================================
--- llvm/lib/Transforms/Utils/Local.cpp
+++ llvm/lib/Transforms/Utils/Local.cpp
@@ -499,9 +499,50 @@
     if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
       return C->isNullValue() || isa<UndefValue>(C);
 
-  if (auto *Call = dyn_cast<CallBase>(I))
+  if (auto *Call = dyn_cast<CallBase>(I)) {
     if (isMathLibCallNoop(Call, TLI))
      return true;
+    // Calls to functions that are marked argmemonly and nothrow, and whose
+    // params are all write-only, can be removed iff every param comes from an
+    // alloca that is used only by lifetime markers and the call itself.
+    auto canCallBeDeleted = [Call]() {
+      auto *F = Call->getCalledFunction();
+      if (!F || !F->doesNotThrow() || !F->onlyAccessesArgMemory())
+        return false;
+      // Are the arguments writeonly?
+      if (!llvm::all_of(F->args(), [](Argument &A) {
+            return A.hasAttribute(Attribute::WriteOnly);
+          }))
+        return false;
+
+      for (auto &Arg : Call->args()) {
+        auto *AI = dyn_cast<AllocaInst>(Arg);
+        if (!AI)
+          return false;
+        // The alloca should only be used in lifetime intrinsics and the call
+        // itself, i.e. the write-only argument has no semantic uses.
+        SmallVector<User *> AllocaUsers(AI->users());
+        while (!AllocaUsers.empty()) {
+          auto *U = AllocaUsers.pop_back_val();
+          auto *UserI = dyn_cast<Instruction>(U);
+          if (!UserI)
+            return false;
+          if (isa<BitCastInst>(UserI)) {
+            AllocaUsers.append(UserI->user_begin(), UserI->user_end());
+            continue;
+          }
+          if (UserI == Call)
+            continue;
+          auto *II = dyn_cast<IntrinsicInst>(UserI);
+          if (!II || !II->isLifetimeStartOrEnd())
+            return false;
+        }
+      }
+      return true;
+    };
+    if (canCallBeDeleted())
+      return true;
+  }
 
   // To express possible interaction with floating point environment
   // constrained intrinsics are described as if they access memory. So they look like having
Index: llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
===================================================================
--- llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
+++ llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
@@ -1,5 +1,11 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #3
+declare i32 @bar()
+
 ; Function Attrs: noinline uwtable
 define dso_local i32 @foo(i32* nocapture writeonly %arg) local_unnamed_addr #0 {
 bb:
@@ -35,12 +41,16 @@
 ;   return bar();
 ; }
 
-; TODO: We should be able to sink the second call @foo at bb5 down to bb_crit_edge
+; We should be able to sink the second call to @foo in bb5 down to bb_crit_edge.
 define dso_local i32 @test() local_unnamed_addr #2 {
 ; CHECK-LABEL: test
 ; CHECK: bb5:
-; CHECK: %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
-; CHECK-NEXT: br i1 %tmp9, label %bb10, label %bb_crit_edge
+; CHECK-NOT: %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
+; CHECK: br i1 %tmp9, label %bb10, label %bb_crit_edge
+
+; CHECK-LABEL: bb_crit_edge:
+; CHECK: %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
+
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = alloca i32, align 4
@@ -76,12 +86,194 @@
   ret i32 %tmp15
 }
 
-declare i32 @bar()
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
+define dso_local i32 @test2(i32* %x) local_unnamed_addr #2 {
+; CHECK-LABEL: test2
+; CHECK: bb5:
+; CHECK: %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
+; CHECK: br i1 %tmp9, label %bb10, label %bb_crit_edge
 
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #3
+bb:
+  %tmp = alloca i32, align 4
+  %tmp1 = alloca i32, align 4
+  %tmp2 = bitcast i32* %tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp2) #4
+  %tmp3 = call i32 @foo(i32* nonnull writeonly %tmp)
+  %tmp4 = icmp eq i32 %tmp3, 0
+  br i1 %tmp4, label %bb5, label %bb14
+
+bb5:                                              ; preds = %bb
+  %tmp6 = bitcast i32* %tmp1 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp6) #4
+  %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
+  %tmp8 = load i32, i32* %tmp, align 4, !tbaa !2
+  %tmp9 = icmp eq i32 %tmp8, 0
+  br i1 %tmp9, label %bb10, label %bb_crit_edge
+
+bb10:                                             ; preds = %bb5
+  %tmp11 = call i32 @bar()
+  br label %bb12
+
+bb_crit_edge:                                     ; preds = %bb5
+  br label %bb12
+
+bb12:                                             ; preds = %bb10, %bb_crit_edge
+  %tmp13 = phi i32 [ %tmp11, %bb10 ], [ %tmp7, %bb_crit_edge ]
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp6) #4
+  br label %bb14
+
+bb14:                                             ; preds = %bb12, %bb
+  %tmp15 = phi i32 [ %tmp13, %bb12 ], [ 0, %bb ]
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp2)
+  ret i32 %tmp15
+}
+
+declare noalias i8* @malloc(i32) willreturn nounwind
+declare void @check2(i8*, i8*)
+declare void @check(i8*)
+
+; TODO: We can sink the allocation down to its only use.
+define i8** @test_allocation_sink_bc(i1 %cond) {
+; CHECK-LABEL: @test_allocation_sink_bc(
+; CHECK-NEXT:    [[A:%.*]] = call dereferenceable_or_null(16000) i8* @malloc(i32 16000)
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[X:%.*]] = bitcast i8* [[A]] to i8**
+; CHECK-NEXT:    ret i8** [[X]]
+; CHECK:       else:
+; CHECK-NEXT:    [[B:%.*]] = call dereferenceable_or_null(24000) i8* @malloc(i32 24000)
+; CHECK-NEXT:    [[Y:%.*]] = bitcast i8* [[B]] to i8**
+; CHECK-NEXT:    ret i8** [[Y]]
+;
+  %A = call i8* @malloc(i32 16000)
+  br i1 %cond, label %if, label %else
+
+if:
+  %X = bitcast i8* %A to i8**
+  ret i8** %X
+
+else:
+  %B = call i8* @malloc(i32 24000)
+  %Y = bitcast i8* %B to i8**
+  ret i8** %Y
+}
+
+; TODO: We can sink the allocation down to its only use.
+define void @test_allocation_sink(i1 %cond) {
+; CHECK-LABEL: @test_allocation_sink(
+; CHECK-NEXT:    [[A:%.*]] = call dereferenceable_or_null(16000) i8* @malloc(i32 16000)
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @check(i8* [[A]])
+; CHECK-NEXT:    ret void
+; CHECK:       else:
+; CHECK-NEXT:    ret void
+;
+  %A = call i8* @malloc(i32 16000)
+  br i1 %cond, label %if, label %else
+
+if:
+  call void @check(i8* %A)
+  ret void
+
+else:
+  ret void
+}
+
+declare i32 @dummy(i32, i32)
+
+; TODO: We can also sink this malloc down to the use block.
+define void @test_allocation_sink_noalias_load(i1 %cond, i32 %i, i32* readonly %P) {
+; CHECK-LABEL: @test_allocation_sink_noalias_load(
+; CHECK-NEXT:    [[A:%.*]] = call dereferenceable_or_null(16000) i8* @malloc(i32 16000)
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @check(i8* [[A]])
+; CHECK-NEXT:    ret void
+; CHECK:       else:
+; CHECK-NEXT:    [[X:%.*]] = call i32 @dummy(i32 [[L]], i32 [[L]])
+; CHECK-NEXT:    [[Y:%.*]] = call i32 @dummy(i32 [[L]], i32 [[L]])
+; CHECK-NEXT:    ret void
+;
+  %A = call i8* @malloc(i32 16000)
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  %L = load i32, i32* %arrayidx, align 4
+  br i1 %cond, label %if, label %else
+
+if:
+  call void @check(i8* %A)
+  ret void
+
+else:
+  %X = call i32 @dummy(i32 %L, i32 %L)
+  %Y = call i32 @dummy(i32 %L, i32 %L)
+  ret void
+}
+
+; TODO: We can sink both allocations down to their only use without them
+; being incorrectly reordered.
+define void @test_allocation_sink2(i1 %cond) {
+; CHECK-LABEL: @test_allocation_sink2(
+; CHECK-NEXT:    [[A:%.*]] = call dereferenceable_or_null(16000) i8* @malloc(i32 16000)
+; CHECK-NEXT:    [[B:%.*]] = call dereferenceable_or_null(16) i8* @malloc(i32 16)
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    call void @check2(i8* [[A]], i8* [[B]])
+; CHECK-NEXT:    ret void
+; CHECK:       else:
+; CHECK-NEXT:    ret void
+;
+  %A = call i8* @malloc(i32 16000)
+  %B = call i8* @malloc(i32 16)
+  br i1 %cond, label %if, label %else
+
+if:
+  call void @check2(i8* %A, i8* %B)
+  ret void
+
+else:
+  ret void
+}
+
+declare i8* @llvm.stacksave() willreturn nounwind
+declare void @llvm.stackrestore(i8*) willreturn nounwind
+
+; We cannot sink the stacksave down to its only use (the stackrestore), since
+; that would move it past the dynamic alloca and incorrectly change the stack
+; state being saved and restored.
+define void @test_neg_stacksave(i1 %cond, i32 %size) {
+; CHECK-LABEL: @test_neg_stacksave(
+; CHECK-NEXT:    [[TMP:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[SIZE:%.*]] to i64
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, i64 [[TMP1]], align 1
+; CHECK-NEXT:    call void @check(i8* nonnull [[A]])
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[TMP]])
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @bar()
+; CHECK-NEXT:    ret void
+; CHECK:       else:
+; CHECK-NEXT:    ret void
+;
+  %tmp = call i8* @llvm.stacksave()
+  %A = alloca i8, i32 %size
+  call void @check(i8* %A)
+  br label %next
+
+next:
+  call void @llvm.stackrestore(i8* %tmp)
+  br i1 %cond, label %if, label %else
+
+if:
+  %X = tail call i32 @bar()
+  ret void
+
+else:
+  ret void
+}
 
 attributes #0 = { noinline uwtable argmemonly nounwind willreturn writeonly }
 attributes #3 = { argmemonly nofree nosync nounwind willreturn }
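
Note: an illustrative sketch, not part of the patch. The new mayHaveSideEffects()
path in TryToSinkInstruction only fires when (a) the instruction would be trivially
dead on the paths that never use it, (b) the destination block's unique predecessor
is the instruction's own block, and (c) the instruction is immediately followed by
its block's terminator. In @test above all three hold for the second call to @foo,
so InstCombine now rewrites

    bb5:
      %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
      br i1 %tmp9, label %bb10, label %bb_crit_edge

into

    bb5:
      br i1 %tmp9, label %bb10, label %bb_crit_edge

    bb_crit_edge:
      %tmp7 = call i32 @foo(i32* nonnull writeonly %tmp1)
      br label %bb12

In @test2 the call is followed by a load and compare rather than by the terminator,
so condition (c) fails and the call stays in bb5, as the updated CHECK lines verify.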
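Note: a minimal sketch, not part of the patch, of the pattern the new
canCallBeDeleted lambda in Local.cpp accepts. The @sink declaration below is
hypothetical, for illustration only: it is nounwind and argmemonly, every
parameter is writeonly, and the alloca passed to it is otherwise used only by
lifetime markers (possibly through bitcasts), so wouldInstructionBeTriviallyDead
can now report the call as removable:

    declare void @sink(i32* writeonly) argmemonly nounwind willreturn

      %buf = alloca i32, align 4
      %p = bitcast i32* %buf to i8*
      call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %p)
      ; Removable: %buf has no semantic uses besides this call.
      call void @sink(i32* nonnull %buf)
      call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %p)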