Index: llvm/trunk/include/llvm/IR/Statepoint.h
===================================================================
--- llvm/trunk/include/llvm/IR/Statepoint.h
+++ llvm/trunk/include/llvm/IR/Statepoint.h
@@ -34,8 +34,15 @@
   None = 0,
   GCTransition = 1, ///< Indicates that this statepoint is a transition from
                     ///< GC-aware code to code that is not GC-aware.
+  /// Mark the deopt arguments associated with the statepoint as only being
+  /// "live-in". By default, deopt arguments are "live-through".  "live-through"
+  /// requires that the value be live on entry, on exit, and at any point
+  /// during the call.  "live-in" only requires the value be available at the
+  /// start of the call.  In particular, "live-in" values can be placed in
+  /// unused argument registers or other non-callee saved registers.
+  DeoptLiveIn = 2,
 
-  MaskAll = GCTransition ///< A bitmask that includes all valid flags.
+  MaskAll = 3 ///< A bitmask that includes all valid flags.
 };
 
 class GCRelocateInst;
Index: llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -370,7 +370,7 @@
 /// Lower a single value incoming to a statepoint node.  This value can be
 /// either a deopt value or a gc value, the handling is the same.  We special
 /// case constants and allocas, then fall back to spilling if required.
-static void lowerIncomingStatepointValue(SDValue Incoming,
+static void lowerIncomingStatepointValue(SDValue Incoming, bool LiveInOnly,
                                          SmallVectorImpl<SDValue> &Ops,
                                          SelectionDAGBuilder &Builder) {
   SDValue Chain = Builder.getRoot();
@@ -389,6 +389,14 @@
     // relocate the address of the alloca itself?)
     Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
                                                   Incoming.getValueType()));
+  } else if (LiveInOnly) {
+    // If this value is live in (not live-on-return, or live-through), we can
+    // treat it the same way patchpoint treats its "live in" values.  We'll
+    // end up folding some of these into stack references, but they'll be 
+    // handled by the register allocator.  Note that we do not have the notion
+    // of a late use so these values might be placed in registers which are 
+    // clobbered by the call.  This is fine for live-in.
+    Ops.push_back(Incoming);
   } else {
     // Otherwise, locate a spill slot and explicitly spill it so it
     // can be found by the runtime later.  We currently do not support
@@ -439,19 +447,38 @@
                "non gc managed derived pointer found in statepoint");
       }
     }
+    assert(SI.Bases.size() == SI.Ptrs.size() && "Pointer without base!");
   } else {
     assert(SI.Bases.empty() && "No gc specified, so cannot relocate pointers!");
     assert(SI.Ptrs.empty() && "No gc specified, so cannot relocate pointers!");
   }
 #endif
 
+  // Figure out what lowering strategy we're going to use for each part.
+  // Note: It is conservatively correct to lower both "live-in" and "live-out"
+  // as "live-through". A "live-through" variable is one which is "live-in",
+  // "live-out", and live throughout the lifetime of the call (i.e. we can find
+  // it from any PC within the transitive callee of the statepoint).  In
+  // particular, if the callee spills callee preserved registers we may not
+  // be able to find a value placed in that register during the call.  This is
+  // fine for live-out, but not for live-through.  If we were willing to make
+  // assumptions about the code generator producing the callee, we could
+  // potentially allow live-through values in callee saved registers.
+  const bool LiveInDeopt =
+    SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn;
+
+  auto isGCValue =[&](const Value *V) {
+    return is_contained(SI.Ptrs, V) || is_contained(SI.Bases, V);
+  };
+  
   // Before we actually start lowering (and allocating spill slots for values),
   // reserve any stack slots which we judge to be profitable to reuse for a
   // particular value.  This is purely an optimization over the code below and
   // doesn't change semantics at all.  It is important for performance that we
   // reserve slots for both deopt and gc values before lowering either.
   for (const Value *V : SI.DeoptState) {
-    reservePreviousStackSlotForValue(V, Builder);
+    if (!LiveInDeopt || isGCValue(V))
+      reservePreviousStackSlotForValue(V, Builder);
   }
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
     reservePreviousStackSlotForValue(SI.Bases[i], Builder);
@@ -468,7 +495,8 @@
   // what type of values are contained within.
   for (const Value *V : SI.DeoptState) {
     SDValue Incoming = Builder.getValue(V);
-    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+    const bool LiveInValue = LiveInDeopt && !isGCValue(V);
+    lowerIncomingStatepointValue(Incoming, LiveInValue, Ops, Builder);
   }
 
   // Finally, go ahead and lower all the gc arguments.  There's no prefixed
@@ -478,10 +506,12 @@
   // (base[0], ptr[0], base[1], ptr[1], ...)
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
     const Value *Base = SI.Bases[i];
-    lowerIncomingStatepointValue(Builder.getValue(Base), Ops, Builder);
+    lowerIncomingStatepointValue(Builder.getValue(Base), /*LiveInOnly*/ false,
+                                 Ops, Builder);
 
     const Value *Ptr = SI.Ptrs[i];
-    lowerIncomingStatepointValue(Builder.getValue(Ptr), Ops, Builder);
+    lowerIncomingStatepointValue(Builder.getValue(Ptr), /*LiveInOnly*/ false,
+                                 Ops, Builder);
   }
 
   // If there are any explicit spill slots passed to the statepoint, record
Index: llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
+++ llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
@@ -448,6 +448,11 @@
     StartIdx = PatchPointOpers(&MI).getVarIdx();
     break;
   }
+  case TargetOpcode::STATEPOINT: {
+    // For statepoints, fold deopt and gc arguments, but not call arguments.
+    StartIdx = StatepointOpers(&MI).getVarIdx();
+    break;
+  }
   default:
     llvm_unreachable("unexpected stackmap opcode");
   }
@@ -513,7 +518,8 @@
   MachineInstr *NewMI = nullptr;
 
   if (MI.getOpcode() == TargetOpcode::STACKMAP ||
-      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+      MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+      MI.getOpcode() == TargetOpcode::STATEPOINT) {
     // Fold stackmap/patchpoint.
     NewMI = foldPatchpoint(MF, MI, Ops, FI, *this);
     if (NewMI)
@@ -794,7 +800,8 @@
   int FrameIndex = 0;
 
   if ((MI.getOpcode() == TargetOpcode::STACKMAP ||
-       MI.getOpcode() == TargetOpcode::PATCHPOINT) &&
+       MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+       MI.getOpcode() == TargetOpcode::STATEPOINT) &&
       isLoadFromStackSlot(LoadMI, FrameIndex)) {
     // Fold stackmap/patchpoint.
     NewMI = foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
Index: llvm/trunk/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ llvm/trunk/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1273,6 +1273,24 @@
 };
 }
 
+static StringRef getDeoptLowering(CallSite CS) {
+  const char *DeoptLowering = "deopt-lowering";
+  if (CS.hasFnAttr(DeoptLowering)) {
+    // Prefer the call site's value, else the callee's.  FIXME: CallSite has
+    // a *really* confusing interface around attributes with values.
+    const AttributeSet &CSAS = CS.getAttributes();
+    if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
+                          DeoptLowering))
+      return CSAS.getAttribute(AttributeSet::FunctionIndex,
+                               DeoptLowering).getValueAsString();
+    Function *F = CS.getCalledFunction();
+    assert(F && F->hasFnAttribute(DeoptLowering));
+    return F->getFnAttribute(DeoptLowering).getValueAsString();
+  }
+  return "live-through"; // Default when the attribute is absent.
+}
+    
+
 static void
 makeStatepointExplicitImpl(const CallSite CS, /* to replace */
                            const SmallVectorImpl<Value *> &BasePtrs,
@@ -1314,6 +1332,14 @@
   if (SD.StatepointID)
     StatepointID = *SD.StatepointID;
 
+  // Pass through the requested lowering if any.  The default is live-through.
+  StringRef DeoptLowering = getDeoptLowering(CS);
+  if (DeoptLowering.equals("live-in"))
+    Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+  else {
+    assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+  }
+
   Value *CallTarget = CS.getCalledValue();
   if (Function *F = dyn_cast<Function>(CallTarget)) {
     if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
Index: llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-live-in.ll
@@ -0,0 +1,131 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare void @bar() #0
+declare void @baz()
+
+define void @test1(i32 %a) gc "statepoint-example" {
+entry:
+; We expect the argument to be passed in an extra register to bar
+; CHECK-LABEL: test1
+; CHECK:       pushq	%rax
+; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT:  .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq	_bar
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
+  ret void
+}
+
+define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
+entry:
+; Because the first call clobbers esi, we have to move the values into
+; new registers.  Note that they stay in the registers for both calls.
+; CHECK-LABEL: @test2
+; CHECK:       movl	%esi, %ebx
+; CHECK-NEXT:  movl	%edi, %ebp
+; CHECK-NEXT: callq	_bar
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %a, i32 %b)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %b, i32 %a)
+  ret void
+}
+
+define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i) gc "statepoint-example" {
+entry:
+; TODO: We should have folded the reload into the statepoint.
+; CHECK-LABEL: @test3
+; CHECK:       	movl	32(%rsp), %r10d
+; CHECK-NEXT: 	movl	24(%rsp), %r11d
+; CHECK-NEXT:   movl	16(%rsp), %eax
+; CHECK-NEXT:   callq	_bar
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
+  ret void
+}
+
+; This case just confirms that we don't crash when given more live values
+; than registers.  This is a case where we *have* to use a stack slot.
+define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+entry:
+; TODO: We should have folded the reload into the statepoint.  
+; CHECK-LABEL: test4
+; CHECK: pushq %r15
+; CHECK: pushq %r14
+; CHECK: pushq %r13
+; CHECK: pushq %r12
+; CHECK: pushq %rbx
+; CHECK: pushq %rax
+; CHECK:       	movl	128(%rsp), %r13d
+; CHECK-NEXT:   movl	120(%rsp), %r12d
+; CHECK-NEXT:   movl	112(%rsp), %r15d
+; CHECK-NEXT:   movl	104(%rsp), %r14d
+; CHECK-NEXT:   movl	96(%rsp), %ebp
+; CHECK-NEXT:   movl	88(%rsp), %ebx
+; CHECK-NEXT:   movl	80(%rsp), %r11d
+; CHECK-NEXT:   movl	72(%rsp), %r10d
+; CHECK-NEXT:   movl	64(%rsp), %eax
+; CHECK-NEXT:   callq	_bar
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  ret void
+}
+
+; A live-through gc-value must be spilled even if it is also a live-in deopt
+; value.  For live-in, we could technically report the register copy, but from
+; a code quality perspective it's better to reuse the required stack slot so 
+; as to put less stress on the register allocator for no benefit.
+define  i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
+entry:
+; CHECK-LABEL: test5
+; CHECK:        movq	%rsi, (%rsp)
+; CHECK-NEXT:   callq	_bar
+  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
+  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 9, i32 9)
+  ret i32 addrspace(1)* %p2
+}
+
+; Show the interaction of live-through spilling followed by live-in.
+define void @test6(i32 %a) gc "statepoint-example" {
+entry:
+; TODO: We could have reused the previous spill slot at zero additional cost.
+; CHECK-LABEL: test6
+; CHECK:        movl %edi, %ebx
+; CHECK:        movl %ebx, 12(%rsp)
+; CHECK-NEXT:   callq	_baz
+; CHECK-NEXT:  Ltmp30:
+; CHECK-NEXT:   callq	_bar
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
+  ret void
+}
+
+
+; CHECK: Ltmp1-_test1
+; CHECK:      .byte	1
+; CHECK-NEXT: .byte	4
+; CHECK-NEXT: .short	5
+; CHECK-NEXT: .long	0
+
+; CHECK: Ltmp7-_test2
+; CHECK:      .byte	1
+; CHECK-NEXT: .byte	4
+; CHECK-NEXT: .short	6
+; CHECK-NEXT: .long	0
+; CHECK:      .byte	1
+; CHECK-NEXT: .byte	4
+; CHECK-NEXT: .short	3
+; CHECK-NEXT: .long	0
+; CHECK: Ltmp8-_test2
+; CHECK:      .byte	1
+; CHECK-NEXT: .byte	4
+; CHECK-NEXT: .short	3
+; CHECK-NEXT: .long	0
+; CHECK:      .byte	1
+; CHECK-NEXT: .byte	4
+; CHECK-NEXT: .short	6
+; CHECK-NEXT: .long	0
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
+
+
+attributes #0 = { "deopt-lowering"="live-in" }
+attributes #1 = { "deopt-lowering"="live-through" }
Index: llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
===================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
@@ -0,0 +1,23 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; Check that the "deopt-lowering" function attribute gets transcoded into
+; flags on the resulting statepoint
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare void @foo()
+declare void @bar() "deopt-lowering"="live-in"
+declare void @baz() "deopt-lowering"="live-through"
+
+define void @test1() gc "statepoint-example" {
+; CHECK-LABEL: @test1(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 1, i32 57)
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 42)
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 13)
+
+entry:
+  call void @foo() [ "deopt"(i32 57) ]
+  call void @bar() [ "deopt"(i32 42) ]
+  call void @baz() [ "deopt"(i32 13) ]
+  ret void
+}