Index: llvm/include/llvm/Analysis/MemoryLocation.h
===================================================================
--- llvm/include/llvm/Analysis/MemoryLocation.h
+++ llvm/include/llvm/Analysis/MemoryLocation.h
@@ -253,6 +253,11 @@
   static MemoryLocation getForDest(const MemIntrinsic *MI);
   static MemoryLocation getForDest(const AtomicMemIntrinsic *MI);
   static MemoryLocation getForDest(const AnyMemIntrinsic *MI);
+  /// Return a location representing the destination of a generic call.
+  /// If a location is returned, client may assume that writing to dest is
+  /// the only side effect of the call, and that said call can be deleted,
+  /// sunk, reordered, etc... solely based on the memory semantics implied by
+  /// the described write, and the def-use chain of the result value (if any).
   static Optional<MemoryLocation> getForDest(const CallBase *CI,
                                              const TargetLibraryInfo &TLI);
 
Index: llvm/lib/Analysis/MemoryLocation.cpp
===================================================================
--- llvm/lib/Analysis/MemoryLocation.cpp
+++ llvm/lib/Analysis/MemoryLocation.cpp
@@ -147,7 +147,43 @@
     }
   }
 
-  return None;
+  // For a generic call, we don't consider it to have a "destination" unless
+  // the calls entirely semantics can be summarized by the write and the
+  // call's return value.  The majority of the complexity here is ensuring
+  // that we can so summarize.
+  if (!CB->onlyAccessesArgMemory() || !CB->willReturn() || !CB->doesNotThrow())
+    return None;
+
+  if (CB->hasOperandBundles())
+    // TODO: remove implementation restriction
+    return None;
+
+  Value *UsedV = nullptr;
+  for (unsigned i = 0; i < CB->arg_size(); i++) {
+    if (!CB->getArgOperand(i)->getType()->isPointerTy())
+      continue;
+    if (!CB->doesNotCapture(i))
+      // capture would allow the address to be read back in an untracked manner
+      return None;
+     if (CB->onlyReadsMemory(i))
+       continue;
+    if (!UsedV) {
+      // First potentially writing parameter
+      UsedV = CB->getArgOperand(i);
+      continue;
+    }
+    if (UsedV != CB->getArgOperand(i))
+      // Can't describe writing to two distinct locations.
+      // TODO: This results in an inprecision when two values derived from the
+      // same object are passed as arguments to the same function.
+      return None;
+  }
+  if (!UsedV)
+    // We don't currently have a way to represent a "does not write" result
+    // and thus have to be conservative and return unknown.
+    return None;
+
+  return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
 }
 
 MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2562,35 +2562,21 @@
 
 /// Given a call CB which uses an address UsedV, return true if we can prove the
 /// call's only possible effect is storing to V.
-static bool isRemovableWrite(CallBase &CB, Value *UsedV) {
+static bool isRemovableWrite(CallBase &CB, Value *UsedV,
+                             const TargetLibraryInfo &TLI) {
   if (!CB.use_empty())
     // TODO: add recursion if returned attribute is present
     return false;
 
-  if (!CB.willReturn() || !CB.doesNotThrow() || !CB.onlyAccessesArgMemory() ||
-      CB.isTerminator())
+  if (CB.isTerminator())
+    // TODO: remove implementation restriction
     return false;
 
-  if (CB.hasOperandBundles())
-    return false;
-
-  for (unsigned i = 0; i < CB.arg_size(); i++) {
-    if (!CB.getArgOperand(i)->getType()->isPointerTy())
-      continue;
-    if (!CB.doesNotCapture(i))
-      // capture would allow the address to be read back in an untracked manner
-      return false;
-    if (UsedV != CB.getArgOperand(i) && !CB.onlyReadsMemory(i))
-      // A write to another memory location keeps the call live, and thus we
-      // must keep the alloca so that the call has somewhere to write to.
-      // TODO: This results in an inprecision when two values derived from the
-      // same alloca are passed as arguments to the same function.
-      return false;
-    // Note: Both reads from and writes to the alloca are fine.  Since the
-    // result is unused nothing can observe the values read from the alloca
-    // without writing it to some other observable location (checked above).
-  }
-  return true;
+  // If the only possible side effect of the call is writing to the alloca,
+  // and the result isn't used, we can safely remove any reads implied by the
+  // call including those which might read the alloca itself.
+  Optional<MemoryLocation> Dest = MemoryLocation::getForDest(&CB, TLI);
+  return Dest && Dest->Ptr == UsedV;
 }
 
 static bool isAllocSiteRemovable(Instruction *AI,
@@ -2660,7 +2646,7 @@
           }
         }
 
-        if (isRemovableWrite(*cast<CallBase>(I), PI)) {
+        if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
           Users.emplace_back(I);
           continue;
         }
Index: llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
@@ -0,0 +1,308 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -dse -S < %s | FileCheck %s
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
+declare void @unknown()
+declare void @f(i8*)
+declare void @f2(i8*, i8*)
+
+; Basic case for DSEing a trivially dead writing call
+define void @test_dead() {
+; CHECK-LABEL: @test_dead(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  ret void
+}
+
+; Add in canonical lifetime intrinsics
+define void @test_lifetime() {
+; CHECK-LABEL: @test_lifetime(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* [[BITCAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* [[BITCAST]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %bitcast)
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %bitcast)
+  ret void
+}
+
+; Add some unknown calls just to point out that this is use based, not
+; instruction order sensitive
+define void @test_lifetime2() {
+; CHECK-LABEL: @test_lifetime2(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* [[BITCAST]])
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* [[BITCAST]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %bitcast)
+  call void @unknown()
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  call void @unknown()
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %bitcast)
+  ret void
+}
+
+; As long as the result is unused, we can even remove reads of the alloca
+; itself since the write will be dropped.
+define void @test_dead_readwrite() {
+; CHECK-LABEL: @test_dead_readwrite(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* nocapture %bitcast) argmemonly nounwind willreturn
+  ret void
+}
+
+define i32 @test_neg_read_after() {
+; CHECK-LABEL: @test_neg_read_after(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[RES:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  %res = load i32, i32* %a
+  ret i32 %res
+}
+
+
+define void @test_neg_infinite_loop() {
+; CHECK-LABEL: @test_neg_infinite_loop(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind
+  ret void
+}
+
+define void @test_neg_throw() {
+; CHECK-LABEL: @test_neg_throw(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly willreturn
+  ret void
+}
+
+define void @test_neg_extra_write() {
+; CHECK-LABEL: @test_neg_extra_write(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) nounwind willreturn
+  ret void
+}
+
+; In this case, we can't remove a1 because we need to preserve the write to
+; a2, and if we leave the call around, we need memory to pass to the first arg.
+define void @test_neg_unmodeled_write() {
+; CHECK-LABEL: @test_neg_unmodeled_write(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast i32* [[A2]] to i8*
+; CHECK-NEXT:    call void @f2(i8* nocapture writeonly [[BITCAST]], i8* [[BITCAST2]]) #[[ATTR1]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %a2 = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  %bitcast2 = bitcast i32* %a2 to i8*
+  call void @f2(i8* nocapture writeonly %bitcast, i8* %bitcast2) argmemonly nounwind willreturn
+  ret void
+}
+
+define i32 @test_neg_captured_by_call() {
+; CHECK-LABEL: @test_neg_captured_by_call(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2:%.*]] = alloca i8*, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast i8** [[A2]] to i8*
+; CHECK-NEXT:    call void @f2(i8* writeonly [[BITCAST]], i8* [[BITCAST2]]) #[[ATTR1]]
+; CHECK-NEXT:    [[A_COPY_CAST:%.*]] = load i8*, i8** [[A2]], align 8
+; CHECK-NEXT:    [[A_COPY:%.*]] = bitcast i8* [[A_COPY_CAST]] to i32*
+; CHECK-NEXT:    [[RES:%.*]] = load i32, i32* [[A_COPY]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %a = alloca i32, align 4
+  %a2 = alloca i8*, align 4
+  %bitcast = bitcast i32* %a to i8*
+  %bitcast2 = bitcast i8** %a2 to i8*
+  call void @f2(i8* writeonly %bitcast, i8* %bitcast2) argmemonly nounwind willreturn
+  %a_copy_cast = load i8*, i8** %a2
+  %a_copy = bitcast i8* %a_copy_cast to i32*
+  %res = load i32, i32* %a_copy
+  ret i32 %res
+}
+
+define i32 @test_neg_captured_before() {
+; CHECK-LABEL: @test_neg_captured_before(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2:%.*]] = alloca i8*, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast i8** [[A2]] to i8*
+; CHECK-NEXT:    store i8* [[BITCAST]], i8** [[A2]], align 8
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]]
+; CHECK-NEXT:    [[A_COPY_CAST:%.*]] = load i8*, i8** [[A2]], align 8
+; CHECK-NEXT:    [[A_COPY:%.*]] = bitcast i8* [[A_COPY_CAST]] to i32*
+; CHECK-NEXT:    [[RES:%.*]] = load i32, i32* [[A_COPY]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %a = alloca i32, align 4
+  %a2 = alloca i8*, align 4
+  %bitcast = bitcast i32* %a to i8*
+  %bitcast2 = bitcast i8** %a2 to i8*
+  store i8* %bitcast, i8** %a2
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  %a_copy_cast = load i8*, i8** %a2
+  %a_copy = bitcast i8* %a_copy_cast to i32*
+  %res = load i32, i32* %a_copy
+  ret i32 %res
+}
+
+; Show that reading from unrelated memory is okay
+define void @test_unreleated_read() {
+; CHECK-LABEL: @test_unreleated_read(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %a2 = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  %bitcast2 = bitcast i32* %a2 to i8*
+  call void @f2(i8* nocapture writeonly %bitcast, i8* nocapture readonly %bitcast2) argmemonly nounwind willreturn
+  ret void
+}
+
+; But that removing a capture of an unrelated pointer isn't okay.
+define void @test_neg_unreleated_capture() {
+; CHECK-LABEL: @test_neg_unreleated_capture(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast i32* [[A2]] to i8*
+; CHECK-NEXT:    call void @f2(i8* nocapture writeonly [[BITCAST]], i8* readonly [[BITCAST2]]) #[[ATTR1]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %a2 = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  %bitcast2 = bitcast i32* %a2 to i8*
+  call void @f2(i8* nocapture writeonly %bitcast, i8* readonly %bitcast2) argmemonly nounwind willreturn
+  ret void
+}
+
+; As long as the result is unused, we can even remove reads of the alloca
+; itself since the write will be dropped.
+define void @test_self_read() {
+; CHECK-LABEL: @test_self_read(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f2(i8* nocapture writeonly %bitcast, i8* nocapture readonly %bitcast) argmemonly nounwind willreturn
+  ret void
+}
+
+; TODO: We should be able to remove the call because while we don't know
+; the size of the write done by the call, we do know the following store
+; writes to the entire contents of the alloca.
+define i32 @test_dse_overwrite() {
+; CHECK-LABEL: @test_dse_overwrite(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]]
+; CHECK-NEXT:    store i32 0, i32* [[A]], align 4
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  store i32 0, i32* %a
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+; Negative case where we can read part of the value written by @f.
+define i32 @test_neg_dse_partial_overwrite() {
+; CHECK-LABEL: @test_neg_dse_partial_overwrite(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]]
+; CHECK-NEXT:    store i8 0, i8* [[BITCAST]], align 1
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32, align 4
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  store i8 0, i8* %bitcast
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+; Negative case where we don't know the size of a, and thus can't use the
+; full overwrite reasoning
+define i32 @test_neg_dse_unsized(i32* %a) {
+; CHECK-LABEL: @test_neg_dse_unsized(
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly [[BITCAST]]) #[[ATTR1]]
+; CHECK-NEXT:    store i32 0, i32* [[A]], align 4
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %bitcast = bitcast i32* %a to i8*
+  call void @f(i8* writeonly nocapture %bitcast) argmemonly nounwind willreturn
+  store i32 0, i32* %a
+  %v = load i32, i32* %a
+  ret i32 %v
+}
+
+
+@G = external global i8
+
+; TODO: Should be able to kill call in analogous manner to test_dse_overwrite.
+; Difference is non-alloca object.
+define void @test_dse_non_alloca() {
+; CHECK-LABEL: @test_dse_non_alloca(
+; CHECK-NEXT:    call void @f(i8* nocapture writeonly @G) #[[ATTR1]]
+; CHECK-NEXT:    store i8 0, i8* @G, align 1
+; CHECK-NEXT:    ret void
+;
+  call void @f(i8* writeonly nocapture @G) argmemonly nounwind willreturn
+  store i8 0, i8* @G
+  ret void
+}
+