Index: llvm/include/llvm/Analysis/MemoryLocation.h =================================================================== --- llvm/include/llvm/Analysis/MemoryLocation.h +++ llvm/include/llvm/Analysis/MemoryLocation.h @@ -253,6 +253,11 @@ static MemoryLocation getForDest(const MemIntrinsic *MI); static MemoryLocation getForDest(const AtomicMemIntrinsic *MI); static MemoryLocation getForDest(const AnyMemIntrinsic *MI); + /// Return a location representing the destination of a generic call. + /// If a location is returned, client may assume that writing to dest is + /// the only side effect of the call, and that said call can be deleted, + /// sunk, reordered, etc... solely based on the memory semantics implied by + /// the described write, and the def-use chain of the result value (if any). static Optional getForDest(const CallBase *CI, const TargetLibraryInfo &TLI); Index: llvm/lib/Analysis/MemoryLocation.cpp =================================================================== --- llvm/lib/Analysis/MemoryLocation.cpp +++ llvm/lib/Analysis/MemoryLocation.cpp @@ -147,7 +147,43 @@ } } - return None; + // For a generic call, we don't consider it to have a "destination" unless + // the calls entirely semantics can be summarized by the write and the + // call's return value. The majority of the complexity here is ensuring + // that we can so summarize. + if (!CB->onlyAccessesArgMemory() || !CB->willReturn() || !CB->doesNotThrow()) + return None; + + if (CB->hasOperandBundles()) + // TODO: remove implementation restriction + return None; + + Value *UsedV = nullptr; + for (unsigned i = 0; i < CB->arg_size(); i++) { + if (!CB->getArgOperand(i)->getType()->isPointerTy()) + continue; + if (!CB->doesNotCapture(i)) + // capture would allow the address to be read back in an untracked manner + return None; + if (CB->onlyReadsMemory(i)) + continue; + if (!UsedV) { + // First potentially writing parameter + UsedV = CB->getArgOperand(i); + continue; + } + if (UsedV != CB->getArgOperand(i)) + // Can't describe writing to two distinct locations. + // TODO: This results in an inprecision when two values derived from the + // same object are passed as arguments to the same function. + return None; + } + if (!UsedV) + // We don't currently have a way to represent a "does not write" result + // and thus have to be conservative and return unknown. + return None; + + return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata()); } MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2562,35 +2562,21 @@ /// Given a call CB which uses an address UsedV, return true if we can prove the /// call's only possible effect is storing to V. -static bool isRemovableWrite(CallBase &CB, Value *UsedV) { +static bool isRemovableWrite(CallBase &CB, Value *UsedV, + const TargetLibraryInfo &TLI) { if (!CB.use_empty()) // TODO: add recursion if returned attribute is present return false; - if (!CB.willReturn() || !CB.doesNotThrow() || !CB.onlyAccessesArgMemory() || - CB.isTerminator()) + if (CB.isTerminator()) + // TODO: remove implementation restriction return false; - if (CB.hasOperandBundles()) - return false; - - for (unsigned i = 0; i < CB.arg_size(); i++) { - if (!CB.getArgOperand(i)->getType()->isPointerTy()) - continue; - if (!CB.doesNotCapture(i)) - // capture would allow the address to be read back in an untracked manner - return false; - if (UsedV != CB.getArgOperand(i) && !CB.onlyReadsMemory(i)) - // A write to another memory location keeps the call live, and thus we - // must keep the alloca so that the call has somewhere to write to. - // TODO: This results in an inprecision when two values derived from the - // same alloca are passed as arguments to the same function. - return false; - // Note: Both reads from and writes to the alloca are fine. Since the - // result is unused nothing can observe the values read from the alloca - // without writing it to some other observable location (checked above). - } - return true; + // If the only possible side effect of the call is writing to the alloca, + // and the result isn't used, we can safely remove any reads implied by the + // call including those which might read the alloca itself. + Optional Dest = MemoryLocation::getForDest(&CB, TLI); + return Dest && Dest->Ptr == UsedV; } static bool isAllocSiteRemovable(Instruction *AI, @@ -2660,7 +2646,7 @@ } } - if (isRemovableWrite(*cast(I), PI)) { + if (isRemovableWrite(*cast(I), PI, TLI)) { Users.emplace_back(I); continue; } Index: llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll @@ -0,0 +1,308 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -dse -S < %s | FileCheck %s + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +declare void @unknown() +declare void @f(i8*) +declare void @f2(i8*, i8*) + +; Basic case for DSEing a trivially dead writing call +define void @test_dead() { +; CHECK-LABEL: @test_dead( +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + ret void +} + +; Add in canonical lifetime intrinsics +define void @test_lifetime() { +; CHECK-LABEL: @test_lifetime( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[BITCAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[BITCAST]]) +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %bitcast) + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + call void @llvm.lifetime.end.p0i8(i64 4, i8* %bitcast) + ret void +} + +; Add some unknown calls just to point out that this is use based, not +; instruction order sensitive +define void @test_lifetime2() { +; CHECK-LABEL: @test_lifetime2( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[BITCAST]]) +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[BITCAST]]) +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %bitcast) + call void @unknown() + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + call void @unknown() + call void @llvm.lifetime.end.p0i8(i64 4, i8* %bitcast) + ret void +} + +; As long as the result is unused, we can even remove reads of the alloca +; itself since the write will be dropped. +define void @test_dead_readwrite() { +; CHECK-LABEL: @test_dead_readwrite( +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* nocapture %bitcast) argmemonly nounwind willreturn + ret void +} + +define i32 @test_neg_read_after() { +; CHECK-LABEL: @test_neg_read_after( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + %res = load i32, i32* %a + ret i32 %res +} + + +define void @test_neg_infinite_loop() { +; CHECK-LABEL: @test_neg_infinite_loop( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind + ret void +} + +define void @test_neg_throw() { +; CHECK-LABEL: @test_neg_throw( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly willreturn + ret void +} + +define void @test_neg_extra_write() { +; CHECK-LABEL: @test_neg_extra_write( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) nounwind willreturn + ret void +} + +; In this case, we can't remove a1 because we need to preserve the write to +; a2, and if we leave the call around, we need memory to pass to the first arg. +define void @test_neg_unmodeled_write() { +; CHECK-LABEL: @test_neg_unmodeled_write( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[BITCAST2:%.*]] = bitcast i32* [[A2]] to i8* +; CHECK-NEXT: call void @f2(i8* nocapture writeonly [[BITCAST]], i8* [[BITCAST2]]) #[[ATTR1]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %a2 = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + %bitcast2 = bitcast i32* %a2 to i8* + call void @f2(i8* nocapture writeonly %bitcast, i8* %bitcast2) argmemonly nounwind willreturn + ret void +} + +define i32 @test_neg_captured_by_call() { +; CHECK-LABEL: @test_neg_captured_by_call( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2:%.*]] = alloca i8*, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[BITCAST2:%.*]] = bitcast i8** [[A2]] to i8* +; CHECK-NEXT: call void @f2(i8* writeonly [[BITCAST]], i8* [[BITCAST2]]) #[[ATTR1]] +; CHECK-NEXT: [[A_COPY_CAST:%.*]] = load i8*, i8** [[A2]], align 8 +; CHECK-NEXT: [[A_COPY:%.*]] = bitcast i8* [[A_COPY_CAST]] to i32* +; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[A_COPY]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %a = alloca i32, align 4 + %a2 = alloca i8*, align 4 + %bitcast = bitcast i32* %a to i8* + %bitcast2 = bitcast i8** %a2 to i8* + call void @f2(i8* writeonly %bitcast, i8* %bitcast2) argmemonly nounwind willreturn + %a_copy_cast = load i8*, i8** %a2 + %a_copy = bitcast i8* %a_copy_cast to i32* + %res = load i32, i32* %a_copy + ret i32 %res +} + +define i32 @test_neg_captured_before() { +; CHECK-LABEL: @test_neg_captured_before( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2:%.*]] = alloca i8*, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[BITCAST2:%.*]] = bitcast i8** [[A2]] to i8* +; CHECK-NEXT: store i8* [[BITCAST]], i8** [[A2]], align 8 +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]] +; CHECK-NEXT: [[A_COPY_CAST:%.*]] = load i8*, i8** [[A2]], align 8 +; CHECK-NEXT: [[A_COPY:%.*]] = bitcast i8* [[A_COPY_CAST]] to i32* +; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[A_COPY]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %a = alloca i32, align 4 + %a2 = alloca i8*, align 4 + %bitcast = bitcast i32* %a to i8* + %bitcast2 = bitcast i8** %a2 to i8* + store i8* %bitcast, i8** %a2 + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + %a_copy_cast = load i8*, i8** %a2 + %a_copy = bitcast i8* %a_copy_cast to i32* + %res = load i32, i32* %a_copy + ret i32 %res +} + +; Show that reading from unrelated memory is okay +define void @test_unreleated_read() { +; CHECK-LABEL: @test_unreleated_read( +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %a2 = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + %bitcast2 = bitcast i32* %a2 to i8* + call void @f2(i8* nocapture writeonly %bitcast, i8* nocapture readonly %bitcast2) argmemonly nounwind willreturn + ret void +} + +; But that removing a capture of an unrelated pointer isn't okay. +define void @test_neg_unreleated_capture() { +; CHECK-LABEL: @test_neg_unreleated_capture( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A2:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[BITCAST2:%.*]] = bitcast i32* [[A2]] to i8* +; CHECK-NEXT: call void @f2(i8* nocapture writeonly [[BITCAST]], i8* readonly [[BITCAST2]]) #[[ATTR1]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %a2 = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + %bitcast2 = bitcast i32* %a2 to i8* + call void @f2(i8* nocapture writeonly %bitcast, i8* readonly %bitcast2) argmemonly nounwind willreturn + ret void +} + +; As long as the result is unused, we can even remove reads of the alloca +; itself since the write will be dropped. +define void @test_self_read() { +; CHECK-LABEL: @test_self_read( +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f2(i8* nocapture writeonly %bitcast, i8* nocapture readonly %bitcast) argmemonly nounwind willreturn + ret void +} + +; TODO: We should be able to remove the call because while we don't know +; the size of the write done by the call, we do know the following store +; writes to the entire contents of the alloca. +define i32 @test_dse_overwrite() { +; CHECK-LABEL: @test_dse_overwrite( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]] +; CHECK-NEXT: store i32 0, i32* [[A]], align 4 +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + store i32 0, i32* %a + %v = load i32, i32* %a + ret i32 %v +} + +; Negative case where we can read part of the value written by @f. +define i32 @test_neg_dse_partial_overwrite() { +; CHECK-LABEL: @test_neg_dse_partial_overwrite( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]] +; CHECK-NEXT: store i8 0, i8* [[BITCAST]], align 1 +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; + %a = alloca i32, align 4 + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + store i8 0, i8* %bitcast + %v = load i32, i32* %a + ret i32 %v +} + +; Negative case where we don't know the size of a, and thus can't use the +; full overwrite reasoning +define i32 @test_neg_dse_unsized(i32* %a) { +; CHECK-LABEL: @test_neg_dse_unsized( +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]] +; CHECK-NEXT: store i32 0, i32* [[A]], align 4 +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; + %bitcast = bitcast i32* %a to i8* + call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn + store i32 0, i32* %a + %v = load i32, i32* %a + ret i32 %v +} + + +@G = external global i8 + +; TODO: Should be able to kill call in analogous manner to test_dse_overwrite. +; Difference is non-alloca object. +define void @test_dse_non_alloca() { +; CHECK-LABEL: @test_dse_non_alloca( +; CHECK-NEXT: call void @f(i8* nocapture writeonly @G) #[[ATTR1]] +; CHECK-NEXT: store i8 0, i8* @G, align 1 +; CHECK-NEXT: ret void +; + call void @f(i8* writeonly nocapture @G) argmemonly nounwind willreturn + store i8 0, i8* @G + ret void +} +