Index: lib/CodeGen/SelectionDAG/StatepointLowering.h
===================================================================
--- lib/CodeGen/SelectionDAG/StatepointLowering.h
+++ lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -43,8 +43,8 @@
 
   /// Returns the spill location of a value incoming to the current
   /// statepoint.  Will return SDValue() if this value hasn't been
-  /// spilled.  Otherwise, the value has already been spilled and no
-  /// further action is required by the caller.
+  /// assigned a spill slot.  Note that the value may have been assigned a
+  /// spill slot, but not yet actually spilled.
   SDValue getLocation(SDValue val) {
     if (!Locations.count(val))
       return SDValue();
Index: lib/CodeGen/SelectionDAG/StatepointLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -114,8 +114,12 @@
 }
 
 /// Utility function for reservePreviousStackSlotForValue. Tries to find
-/// stack slot index to which we have spilled value for previous statepoints.
-/// LookUpDepth specifies maximum DFS depth this function is allowed to look.
+/// stack slot index which is obviously profitable to use for this value.
+/// Looks for values which spills of the same value, values generated from
+/// reloads, and simple updates to previously spilled values.  LookUpDepth
+/// specifies maximum DFS depth this function is allowed to look. This function
+/// doesn't worry about conflicting spill slots, that's handled by it's
+/// caller. 
 static Optional<int> findPreviousSpillSlot(const Value *Val,
                                            SelectionDAGBuilder &Builder,
                                            int LookUpDepth) {
@@ -137,11 +141,38 @@
     return It->second;
   }
 
-  // Look through bitcast instructions.
+  // Look through bitcast instructions.  We can do this since bitcast don't
+  // change the value in the slot and we can store both the original and
+  // bitcasted value in the same slot.
   if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val)) {
     return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1);
   }
 
+  // We can place a new value in an existing stack slot if we can be confident
+  // that the original value won't be live over the statepoint and thus
+  // preferrable to put in that slot.  Note that hasOneUse check is an
+  // optimization heuristic, not a correctness check.  Conflicting reservations
+  // are handled explicitly in our caller.  Single use updates have a
+  // particularly nice property on x86 in that many of them can be folded to
+  // updates against memory if the same spill slot can be used on both sides.
+  // This really helps cases like:
+  //   statepoint @foo() (p)
+  //   p2 = gep p+1
+  //   statepoint @bar() (p2) <-- where p is dead here
+  // Note that the spill slot reuse works for deopt state as well, but we don't
+  // get the nice update in place property since the data dependency in on p,
+  // not reload(spill(p)) and the register allocator doesn't know that the
+  // spill slot contains the value of (p).  This is probably fixable.
+  if (Val->hasOneUse()) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Val))
+      return findPreviousSpillSlot(GEP->getPointerOperand(), Builder,
+                                   LookUpDepth - 1);
+    if (auto *BOp = dyn_cast<BinaryOperator>(Val))
+      if (isa<Constant>(BOp->getOperand(1)))
+        return findPreviousSpillSlot(BOp->getOperand(0), Builder,
+                                     LookUpDepth - 1);
+  }
+
   // Look through phi nodes
   // All incoming values should have same known stack slot, otherwise result
   // is unknown.
@@ -173,22 +204,6 @@
   // in example we still need to emit store, but instead of any location
   // we need to use special "preferred" location.
 
-  // TODO: handle simple updates.  If a value is modified and the original
-  // value is no longer live, it would be nice to put the modified value in the
-  // same slot.  This allows folding of the memory accesses for some
-  // instructions types (like an increment).
-  //   statepoint (i)
-  //   i1 = i+1
-  //   statepoint (i1)
-  // However we need to be careful for cases like this:
-  //   statepoint(i)
-  //   i1 = i+1
-  //   statepoint(i, i1)
-  // Here we want to reserve spill slot for 'i', but not for 'i+1'. If we just
-  // put handling of simple modifications in this function like it's done
-  // for bitcasts we might end up reserving i's slot for 'i+1' because order in
-  // which we visit values is unspecified.
-
   // Don't know any information about this instruction
   return Optional<int>();
 }
@@ -197,7 +212,9 @@
 /// statepoint spilling.  If we can find a spill slot for the incoming value,
 /// mark that slot as allocated, and reuse the same slot for this safepoint.
 /// This helps to avoid series of loads and stores that only serve to resuffle
-/// values on the stack between calls.
+/// values on the stack between calls.  We may also be able to reuse an
+/// existing slot if we can trivially tell the original value isn't live at
+/// this statepoint but a value computed from it is.
 static void reservePreviousStackSlotForValue(const Value *IncomingValue,
                                              SelectionDAGBuilder &Builder) {
 
@@ -230,11 +247,6 @@
       std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
   if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
     // stack slot already assigned to someone else, can't use it!
-    // TODO: currently we reserve space for gc arguments after doing
-    // normal allocation for deopt arguments.  We should reserve for
-    // _all_ deopt and gc arguments, then start allocating.  This
-    // will prevent some moves being inserted when vm state changes,
-    // but gc state doesn't between two calls.
     return;
   }
   // Reserve this stack slot
@@ -399,25 +411,29 @@
                              SelectionDAGBuilder &Builder) {
   SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
 
-  // Emit new store if we didn't do it for this ptr before
+  // If we couldn't find a profitable stack slot previously, assign one.
   if (!Loc.getNode()) {
     Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
                                                        Builder);
     assert(isa<FrameIndexSDNode>(Loc));
-    int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
-    // We use TargetFrameIndex so that isel will not select it into LEA
-    Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
-
-    // TODO: We can create TokenFactor node instead of
-    //       chaining stores one after another, this may allow
-    //       a bit more optimal scheduling for them
-    Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
-                                 MachinePointerInfo::getFixedStack(Index),
-                                 false, false, 0);
-
     Builder.StatepointLowering.setLocation(Incoming, Loc);
   }
 
+  // Now spill the value.  When we have multiple values which map the same
+  // stack slot this may result in redundant spills, but we can generally rely
+  // on the backend to clean these up for us and don't need to be fancy here.
+  
+  int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+  // We use TargetFrameIndex so that isel will not select it into LEA
+  Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+  
+  // TODO: We can create TokenFactor node instead of
+  //       chaining stores one after another, this may allow
+  //       a bit more optimal scheduling for them
+  Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+                               MachinePointerInfo::getFixedStack(Index),
+                               false, false, 0);
+
   assert(Loc.getNode());
   return std::make_pair(Loc, Chain);
 }
Index: test/CodeGen/X86/statepoint-stack-usage.ll
===================================================================
--- test/CodeGen/X86/statepoint-stack-usage.ll
+++ test/CodeGen/X86/statepoint-stack-usage.ll
@@ -8,7 +8,7 @@
 ; of GC arguments differ, niave lowering code would insert loads and 
 ; stores to rearrange items on the stack.  We need to make sure (for
 ; performance) that this doesn't happen.
-define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
+define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_calls
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdi, 16(%rsp)
@@ -31,7 +31,7 @@
 
 ; This test simply checks that minor changes in vm state don't prevent slots
 ; being reused for gc values.  
-define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
+define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" {
 ; CHECK-LABEL: reserve_first
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdi, 16(%rsp)
@@ -53,7 +53,7 @@
 }
 
 ; Test that stack slots are reused for invokes
-define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
+define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_invokes
 entry:
   ; The exact stores don't matter, but there need to be three stack slots created
@@ -93,12 +93,89 @@
   ret i32 0
 }
 
+; A simple forwarding case where the same value is used across two
+; safepoints.  We'll build on this pattern for other tests.
+define i32 addrspace(1)* @test3(i32 addrspace(1)* %a) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK:        movq %rdi, (%rsp)
+; CHECK-NEXT:   callq
+; CHECK-NOT:    movq
+; CHECK:        callq
+; CHECK:        movq (%rsp), %rax
+
+  %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
+  %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1)
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7)
+  ret i32 addrspace(1)* %a2
+}
+
+; A bitcast does not change the value stored, we can record the same stack slot
+; for both a bit cast and it's source value.  We'll end up with one stack slot,
+; but two locations in the stackmap.
+define i32 addrspace(1)* @test4(i32 addrspace(1)* %a) gc "statepoint-example" {
+; CHECK-LABEL: test4
+; CHECK:        movq %rdi, (%rsp)
+; CHECK-NEXT:   callq
+; CHECK-NOT:    movq
+; CHECK:        callq
+; CHECK:        movq (%rsp), %rax
+
+  %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
+  %a1_cast = bitcast i32 addrspace(1)* %a1 to i64 addrspace(1)*
+  %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1, i64 addrspace(1)* %a1_cast)
+  %unused = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7)
+  %a2 = tail call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32 %safepoint_token2, i32 8, i32 8)
+  %a2_cast = bitcast i64 addrspace(1)* %a2 to i32 addrspace(1)*
+  ret i32 addrspace(1)* %a2_cast
+}
+
+; For certain RMW idioms between statepoints, we can reuse the original stack 
+; slot and update the memory slot directly.  This is obviously profitable if 
+; the original value is only live for the update.
+define i32 addrspace(1)* @test5(i32 addrspace(1)* %a) gc "statepoint-example" {
+; CHECK-LABEL: test5
+; CHECK:        movq %rdi, (%rsp)
+; CHECK-NEXT:   callq
+; CHECK-NOT:    movq
+; CHECK:        addq $32, (%rsp)
+; CHECK-NEXT:   callq
+; CHECK:        movq (%rsp), %rax
+
+  %safepoint_token = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
+  %a1_derived = getelementptr i32, i32 addrspace(1)* %a1, i64 8
+  %safepoint_token2 = tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a1_derived)
+  %relocated = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 7, i32 7)
+  ret i32 addrspace(1)* %relocated
+}
+
+; Same as test5, but with an integer add operation on deopt state rather than
+; a GEP on a GC pointer.  Note that we don't get update in place at the moment.
+define void @test6(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test6
+; CHECK:        movl	%edi, %ebx
+; CHECK-NEXT:   movl %ebx, 12(%rsp)
+; CHECK-NEXT:   callq
+; CHECK-NOT:    movq
+; CHECK:        incl %ebx
+; CHECK-NEXT:   movl %ebx, 12(%rsp)
+; CHECK-NEXT:   callq
+
+  call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  %a1_add = add i32 %a, 1
+  tail call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 1, i32 %a1_add)
+  ret void
+}
+
+
 ; Function Attrs: nounwind
 declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3
+declare i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(i32, i32, i32) #3
 declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32, i32, i32) #3
 
 declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
 
 declare i32 @"personality_function"()
 
-attributes #1 = { uwtable }