diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -99,6 +99,11 @@
   using StatepointSpillMapTy = DenseMap<const Value *, Optional<int>>;
   DenseMap<const Instruction *, StatepointSpillMapTy> StatepointSpillMaps;
 
+  /// For each statepoint, keep a mapping from the original derived pointer to
+  /// the index of the STATEPOINT node result that defines its new value.
+  using DerivedPtrMapTy = DenseMap<const Value *, unsigned>;
+  DenseMap<const Instruction *, DerivedPtrMapTy> DerivedPtrMap;
+
   /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in
   /// the entry block.  This allows the allocas to be efficiently referenced
   /// anywhere in the function.
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -82,6 +82,28 @@
   return N;
 }
 
+/// Return the index of the first GC operand of the STATEPOINT instruction MI.
+// FIXME: need a better place for this. Put it in StackMaps?
+static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) {
+  assert(MI->getOpcode() == TargetOpcode::STATEPOINT &&
+         "STATEPOINT node expected");
+  const unsigned NumDeoptsIdx = StatepointOpers(MI).getNumDeoptArgsIdx();
+  // Walk over the deopt arguments, which directly follow their count.
+  // Stack references have not been lowered yet at this point, so each one
+  // still takes a single operand; only a constant takes two (the
+  // StackMaps::ConstantOp marker followed by the immediate value).
+  unsigned Cursor = NumDeoptsIdx + 1;
+  for (unsigned Left = MI->getOperand(NumDeoptsIdx).getImm(); Left; --Left) {
+    const MachineOperand &Arg = MI->getOperand(Cursor++);
+    if (!Arg.isImm() || Arg.getImm() != StackMaps::ConstantOp)
+      continue;
+    assert(MI->getOperand(Cursor).isImm() &&
+           "Unexpected statepoint operand");
+    ++Cursor;
+  }
+  return Cursor;
+}
+
 /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
 /// implicit physical register output.
 void InstrEmitter::
@@ -200,6 +222,8 @@
   bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
                              II.isVariadic() && II.variadicOpsAreDefs();
   unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs();
+  if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT)
+    NumVRegs = NumResults;
   for (unsigned i = 0; i < NumVRegs; ++i) {
     // If the specific node value is only used by a CopyToReg and the dest reg
     // is a vreg in the same register class, use the CopyToReg'd destination
@@ -821,6 +845,8 @@
       NumDefs = NumResults;
     }
     ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+  } else if (Opc == TargetOpcode::STATEPOINT) {
+    NumDefs = NumResults;
   }
 
   unsigned NumImpUses = 0;
@@ -970,6 +996,20 @@
   if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
     MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
 
+  // STATEPOINT is too 'dynamic' to have a meaningful machine description,
+  // so we have to tie its operands manually.
+  if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) {
+    assert(!HasPhysRegOuts && "STATEPOINT mishandled");
+    MachineInstr *MI = MIB;
+    unsigned Def = 0;
+    unsigned Use = getStatepointGCArgStartIdx(MI) + 1;
+    while (Def < NumDefs) {
+      if (MI->getOperand(Use).isReg())
+        MI->tieOperands(Def++, Use);
+      Use += 2;
+    }
+  }
+
   // Run post-isel target hook to adjust this instruction if needed.
   if (II.hasPostISelHook())
     TLI->AdjustInstrPostInstrSelection(*MIB, Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -125,8 +125,7 @@
     PhysReg = Reg;
   } else if (Def->isMachineOpcode()) {
     const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
-    if (ResNo >= II.getNumDefs() &&
-        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
+    if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg))
       PhysReg = Reg;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -67,6 +67,10 @@
     "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
     cl::desc("Allow using registers for non pointer deopt args"));
 
+cl::opt<bool> UseRegistersForGCPointers(
+    "use-registers-for-gc-values", cl::Hidden, cl::init(false),
+    cl::desc("Allow using registers for GC pointer meta args"));
+
 static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
                                  SelectionDAGBuilder &Builder, uint64_t Value) {
   SDLoc L = Builder.getCurSDLoc();
@@ -221,6 +225,14 @@
   return None;
 }
 
+// Return true if V is a value that does not need to be relocated/spilled:
+// a constant (integer or FP), frame index or undef no wider than 64 bits.
+static bool isConstantVal(SDValue V) {
+  const bool FitsIn64Bits = V.getValueSizeInBits() <= 64;
+  return FitsIn64Bits && (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V) ||
+                          isa<FrameIndexSDNode>(V) || V.isUndef());
+}
+
 /// Try to find existing copies of the incoming values in stack slots used for
 /// statepoint spilling.  If we can find a spill slot for the incoming value,
 /// mark that slot as allocated, and reuse the same slot for this safepoint.
@@ -230,12 +242,8 @@
                                              SelectionDAGBuilder &Builder) {
   SDValue Incoming = Builder.getValue(IncomingValue);
 
-  if (isa<ConstantSDNode>(Incoming) || isa<ConstantFPSDNode>(Incoming) ||
-      isa<FrameIndexSDNode>(Incoming) || Incoming.isUndef()) {
-    // We won't need to spill this, so no need to check for previously
-    // allocated stack slots
+  if (isConstantVal(Incoming))
     return;
-  }
 
   SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
   if (OldLocation.getNode())
@@ -368,7 +376,7 @@
                                  StoreMMO);
 
     MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
-    
+
     Builder.StatepointLowering.setLocation(Incoming, Loc);
   }
 
@@ -430,6 +438,7 @@
     // of a late use so these values might be placed in registers which are
     // clobbered by the call.  This is fine for live-in. For live-through
     // fix-up pass should be executed to force spilling of such registers.
+    assert(!isConstantVal(Incoming) && "Unexpected SDValue");
     Ops.push_back(Incoming);
   } else {
     // Otherwise, locate a spill slot and explicitly spill it so it
@@ -457,7 +466,9 @@
 /// will be set to the last value spilled (if any were).
 static void
 lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
-                        SmallVectorImpl<MachineMemOperand*> &MemRefs,                                    SelectionDAGBuilder::StatepointLoweringInfo &SI,
+                        SmallVectorImpl<MachineMemOperand *> &MemRefs,
+                        unsigned &NumVRegGCArgs,
+                        SelectionDAGBuilder::StatepointLoweringInfo &SI,
                         SelectionDAGBuilder &Builder) {
   // Lower the deopt and gc arguments for this statepoint.  Layout will be:
   // deopt argument length, deopt arguments.., gc arguments...
@@ -514,7 +525,9 @@
   };
 
   auto requireSpillSlot = [&](const Value *V) {
-    return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
+    if (isGCValue(V))
+      return !UseRegistersForGCPointers || V->getType()->isVectorTy();
+    return !(LiveInDeopt || UseRegistersForDeoptValues);
   };
 
   // Before we actually start lowering (and allocating spill slots for values),
@@ -526,10 +539,39 @@
     if (requireSpillSlot(V))
       reservePreviousStackSlotForValue(V, Builder);
   }
+
+  const size_t MaxTiedRegs = 15; // Max number of tied regs MI can have.
+  unsigned MaxVRegPtrs = UseRegistersForGCPointers ? std::min(MaxTiedRegs, SI.Ptrs.size()) : 0;
+  unsigned CurNumVRegs = 0;
+
+  // Use old spill scheme for cross-block relocates.
+  if (SI.StatepointInstr) {
+    const BasicBlock *BB = SI.StatepointInstr->getParent();
+    bool NonLocalReloc = llvm::any_of(SI.GCRelocates,
+                                      [BB](const GCRelocateInst *R) {
+                                        return R->getParent() != BB;
+                                      });
+    if (NonLocalReloc)
+      MaxVRegPtrs = 0;
+  }
+
+  // Records which Ptr[] values are assigned to VRegs.
+  // We need this intermediate map because
+  //    SI.Ptrs.size() <= SI.GCRelocates.size()
+  // (due to derived pointers deduplication)
+  DenseMap<SDValue, unsigned> Ptr2Res;
+
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+    SDValue SDV = Builder.getValue(SI.Ptrs[i]);
+    if (CurNumVRegs < MaxVRegPtrs && !isConstantVal(SDV)) {
+      Ptr2Res[SDV] = CurNumVRegs++;
+      continue;
+    }
+    Ptr2Res[SDV] = MaxTiedRegs;
     reservePreviousStackSlotForValue(SI.Bases[i], Builder);
     reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
   }
+  NumVRegGCArgs = CurNumVRegs;
 
   // First, prefix the list with the number of unique values to be
   // lowered.  Note that this is the number of *Values* not the
@@ -559,16 +601,24 @@
   // arrays interwoven with each (lowered) base pointer immediately followed by
   // it's (lowered) derived pointer.  i.e
   // (base[0], ptr[0], base[1], ptr[1], ...)
+  // Lower the first `NumVRegGCArgs` base AND derived pointers through VRegs.
+  // In the future we might use a more sophisticated strategy for choosing
+  // which pointers to pass via virtual registers, but for now this simple
+  // approach looks good enough. Take into account these facts:
+  //  - NumVRegGCArgs is limited by the max number of tied registers in MI;
+  //  - We relocate (and so need tied defs for) only derived pointers;
+  //  - Quite often base and derived pointer are the same.
+
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
-    const Value *Base = SI.Bases[i];
-    lowerIncomingStatepointValue(Builder.getValue(Base),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
-                                 Builder);
+    SDValue Derived = Builder.getValue(SI.Ptrs[i]);
+    assert(Ptr2Res.find(Derived) != Ptr2Res.end() && "Broken DerivedPtr map");
 
-    const Value *Ptr = SI.Ptrs[i];
-    lowerIncomingStatepointValue(Builder.getValue(Ptr),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
-                                 Builder);
+    bool RequireSpillSlot = (Ptr2Res[Derived] == MaxTiedRegs);
+    const Value *Base = SI.Bases[i];
+    lowerIncomingStatepointValue(Builder.getValue(Base), RequireSpillSlot, Ops,
+                                 MemRefs, Builder);
+    lowerIncomingStatepointValue(Derived, RequireSpillSlot, Ops,
+                                 MemRefs, Builder);
   }
 
   // If there are any explicit spill slots passed to the statepoint, record
@@ -596,13 +646,17 @@
   // values, while previous loops account only values with unique SDValues.
   const Instruction *StatepointInstr = SI.StatepointInstr;
   auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr];
+  auto &DPtrMap = Builder.FuncInfo.DerivedPtrMap[StatepointInstr];
 
   for (const GCRelocateInst *Relocate : SI.GCRelocates) {
     const Value *V = Relocate->getDerivedPtr();
     SDValue SDV = Builder.getValue(V);
     SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
 
-    if (Loc.getNode()) {
+    if (Ptr2Res[SDV] < MaxTiedRegs) {
+      DPtrMap[V] = Ptr2Res[SDV];
+      SpillMap[V] = None;
+    } else if (Loc.getNode()) {
       SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
     } else {
       // Record value as visited, but not spilled. This is case for allocas
@@ -646,7 +700,11 @@
   // Lower statepoint vmstate and gcstate arguments
   SmallVector<SDValue, 10> LoweredMetaArgs;
   SmallVector<MachineMemOperand*, 16> MemRefs;
-  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this);
+  unsigned NumVRegs = 0;
+  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, NumVRegs, SI, *this);
+
+
+  LLVM_DEBUG(dbgs() << "NumVRegs = " << NumVRegs << "\n");
 
   // Now that we've emitted the spills, we need to update the root so that the
   // call sequence is ordered correctly.
@@ -760,8 +818,13 @@
 
   // Compute return values.  Provide a glue output since we consume one as
   // input.  This allows someone else to chain off us as needed.
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<EVT, 8> NodeTys;
+  for (unsigned i = 0; i < NumVRegs; ++i)
+    NodeTys.push_back(getValue(SI.Ptrs[i]).getValueType());
+  NodeTys.push_back(MVT::Other);
+  NodeTys.push_back(MVT::Glue);
 
+  unsigned NumResults = NodeTys.size();
   MachineSDNode *StatepointMCNode =
     DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
   DAG.setNodeMemRefs(StatepointMCNode, MemRefs);
@@ -776,7 +839,7 @@
     SmallVector<SDValue, 8> TEOps;
 
     // Add chain
-    TEOps.push_back(SDValue(StatepointMCNode, 0));
+    TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2));
 
     // Add GC transition arguments
     for (const Value *V : SI.GCTransitionArgs) {
@@ -786,7 +849,7 @@
     }
 
     // Add glue
-    TEOps.push_back(SDValue(StatepointMCNode, 1));
+    TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1));
 
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
@@ -797,7 +860,12 @@
   }
 
   // Replace original call
-  DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
+  // Call: ch,glue = CALL ...
+  // Statepoint: [gc relocates],ch,glue = STATEPOINT ...
+  unsigned NumSinkValues = SinkNode->getNumValues();
+  SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2),
+                                 SDValue(SinkNode, NumSinkValues - 1)};
+  DAG.ReplaceAllUsesWith(CallNode, StatepointValues);
   // Remove original call node
   DAG.DeleteNode(CallNode);
 
@@ -810,7 +878,12 @@
   // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
   // to actually be possible today.
 
-  return ReturnVal;
+  // An SDValue must have a type to be used as a MERGE_VALUES operand. Use a
+  // void UNDEF as a placeholder for void functions.
+  if (!ReturnVal)
+    ReturnVal = DAG.getUNDEF(MVT::isVoid);
+  return DAG.getMergeValues({ReturnVal, SDValue(StatepointMCNode, 0)},
+                            getCurSDLoc());
 }
 
 void
@@ -880,7 +953,18 @@
   SI.NumPatchBytes = I.getNumPatchBytes();
   SI.EHPadBB = EHPadBB;
 
-  SDValue ReturnValue = LowerAsSTATEPOINT(SI);
+  SDValue Merge = LowerAsSTATEPOINT(SI);
+  assert(Merge->getOpcode() == ISD::MERGE_VALUES);
+  setValue(&I, Merge);
+
+  const BasicBlock *BB = I.getParent();
+
+  // In case of non-local relocates all GC args must be spilled, so the
+  // only two remaining values are the chain and the glue.
+  std::vector<const GCRelocateInst *> RV = I.getGCRelocates();
+  bool NeedExport = llvm::any_of(
+      RV, [&BB](const GCRelocateInst *R) { return R->getParent() != BB; });
+  assert(!NeedExport || Merge.getOperand(1)->getNumValues() == 2);
 
   // Export the result value if needed
   const GCResultInst *GCResult = I.getGCResult();
@@ -895,6 +979,7 @@
       // with correct type manually.
       // TODO: To eliminate this problem we can remove gc.result intrinsics
       //       completely and make statepoint call to return a tuple.
+      SDValue ReturnValue = Merge->getOperand(0);
       unsigned Reg = FuncInfo.CreateRegs(RetTy);
       RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                        DAG.getDataLayout(), Reg, RetTy,
@@ -904,16 +989,7 @@
       RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
       PendingExports.push_back(Chain);
       FuncInfo.ValueMap[&I] = Reg;
-    } else {
-      // Result value will be used in a same basic block. Don't export it or
-      // perform any explicit register copies.
-      // We'll replace the actuall call node shortly. gc_result will grab
-      // this value.
-      setValue(&I, ReturnValue);
     }
-  } else {
-    // The token value is never used from here on, just generate a poison value
-    setValue(&I, DAG.getIntPtrConstant(-1, getCurSDLoc()));
   }
 }
 
@@ -944,7 +1020,9 @@
 
   // NB! The GC arguments are deliberately left empty.
 
-  if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) {
+  auto Ret = LowerAsSTATEPOINT(SI);
+  assert(Ret->getOpcode() == ISD::MERGE_VALUES);
+  if (SDValue ReturnVal = Ret.getOperand(0)) {
     ReturnVal = lowerRangeToAssertZExt(DAG, *Call, ReturnVal);
     setValue(Call, ReturnVal);
   }
@@ -975,17 +1053,21 @@
     assert(CopyFromReg.getNode());
     setValue(&CI, CopyFromReg);
   } else {
-    setValue(&CI, getValue(I));
+    SDValue SD = getValue(I);
+    if (SD->getOpcode() == ISD::MERGE_VALUES)
+      SD = SD->getOperand(0);
+    setValue(&CI, SD);
   }
 }
 
 void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+  const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent();
 #ifndef NDEBUG
   // Consistency check
   // We skip this check for relocates not in the same basic block as their
   // statepoint. It would be too expensive to preserve validation info through
   // different basic blocks.
-  if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
+  if (StatepointBB == Relocate.getParent())
     StatepointLowering.relocCallVisited(Relocate);
 
   auto *Ty = Relocate.getType()->getScalarType();
@@ -1008,6 +1090,23 @@
   assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
   Optional<int> DerivedPtrLocation = SlotIt->second;
 
+  auto &DPtrMap = FuncInfo.DerivedPtrMap[Relocate.getStatepoint()];
+  auto It = DPtrMap.find(Relocate.getDerivedPtr());
+  if (It != DPtrMap.end()) {
+    // This GC ptr is lowered through VReg.
+    unsigned Index = It->second;
+    SDValue Result;
+    if (StatepointBB != Relocate.getParent()) {
+      llvm_unreachable("VReg for non-local relocates encountered");
+    }
+    SDNode *Statepoint = getValue(Relocate.getStatepoint()).getNode();
+    if (Statepoint->getOpcode() == ISD::MERGE_VALUES)
+      Statepoint = Statepoint->getOperand(1).getNode();
+    Result = SDValue(Statepoint, Index);
+    setValue(&Relocate, Result);
+    return;
+  }
+
   // We didn't need to spill these special cases (constants and allocas).
   // See the handling in spillIncomingValueForStatepoint for detail.
   if (!DerivedPtrLocation) {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1041,9 +1041,15 @@
   // Inherit previous memory operands.
   MIB.cloneMemRefs(*MI);
 
-  for (auto &MO : MI->operands()) {
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
     if (!MO.isFI()) {
+      unsigned TiedTo = i;
+      if (MO.isReg() && MO.isTied())
+        TiedTo = MI->findTiedOperandIdx(i);
       MIB.add(MO);
+      if (TiedTo < i)
+        MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1);
       continue;
     }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll
--- a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-ALL %s
+; RUN: llc -verify-machineinstrs -use-registers-for-gc-values=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s
 ; This file contains a collection of basic tests to ensure we didn't
 ; screw up normal call lowering when there are no deopt or gc arguments.
 
@@ -16,15 +17,15 @@
 declare void @varargf(i32, ...)
 
 define i1 @test_i1_return() gc "statepoint-example" {
-; CHECK-LABEL: test_i1_return:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq return_i1
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_i1_return:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq return_i1
+; CHECK-ALL-NEXT:  .Ltmp0:
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 ; This is just checking that a i1 gets lowered normally when there's no extra
 ; state arguments to the statepoint
 entry:
@@ -34,15 +35,15 @@
 }
 
 define i32 @test_i32_return() gc "statepoint-example" {
-; CHECK-LABEL: test_i32_return:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq return_i32
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_i32_return:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq return_i32
+; CHECK-ALL-NEXT:  .Ltmp1:
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* @return_i32, i32 0, i32 0, i32 0, i32 0)
   %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
@@ -50,15 +51,15 @@
 }
 
 define i32* @test_i32ptr_return() gc "statepoint-example" {
-; CHECK-LABEL: test_i32ptr_return:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq return_i32ptr
-; CHECK-NEXT:  .Ltmp2:
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_i32ptr_return:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq return_i32ptr
+; CHECK-ALL-NEXT:  .Ltmp2:
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = tail call token (i64, i32, i32* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p0i32f(i64 0, i32 0, i32* ()* @return_i32ptr, i32 0, i32 0, i32 0, i32 0)
   %call1 = call i32* @llvm.experimental.gc.result.p0i32(token %safepoint_token)
@@ -66,15 +67,15 @@
 }
 
 define float @test_float_return() gc "statepoint-example" {
-; CHECK-LABEL: test_float_return:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq return_float
-; CHECK-NEXT:  .Ltmp3:
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_float_return:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq return_float
+; CHECK-ALL-NEXT:  .Ltmp3:
+; CHECK-ALL-NEXT:    popq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = tail call token (i64, i32, float ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f32f(i64 0, i32 0, float ()* @return_float, i32 0, i32 0, i32 0, i32 0)
   %call1 = call float @llvm.experimental.gc.result.f32(token %safepoint_token)
@@ -82,15 +83,15 @@
 }
 
 define %struct @test_struct_return() gc "statepoint-example" {
-; CHECK-LABEL: test_struct_return:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq return_struct
-; CHECK-NEXT:  .Ltmp4:
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_struct_return:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq return_struct
+; CHECK-ALL-NEXT:  .Ltmp4:
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = tail call token (i64, i32, %struct ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_structf(i64 0, i32 0, %struct ()* @return_struct, i32 0, i32 0, i32 0, i32 0)
   %call1 = call %struct @llvm.experimental.gc.result.struct(token %safepoint_token)
@@ -108,6 +109,22 @@
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+; CHECK-VREG-LABEL: test_relocate:
+; CHECK-VREG:       # %bb.0: # %entry
+; CHECK-VREG-NEXT:    pushq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    subq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT:    .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT:    callq return_i1
+; CHECK-VREG-NEXT:  .Ltmp5:
+; CHECK-VREG-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT:    addq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    popq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT:    retq
 ; Check that an ununsed relocate has no code-generation impact
 entry:
   %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)]
@@ -117,17 +134,17 @@
 }
 
 define void @test_void_vararg() gc "statepoint-example" {
-; CHECK-LABEL: test_void_vararg:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movl $42, %edi
-; CHECK-NEXT:    movl $43, %esi
-; CHECK-NEXT:    callq varargf
-; CHECK-NEXT:  .Ltmp6:
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_void_vararg:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    movl $42, %edi
+; CHECK-ALL-NEXT:    movl $43, %esi
+; CHECK-ALL-NEXT:    callq varargf
+; CHECK-ALL-NEXT:  .Ltmp6:
+; CHECK-ALL-NEXT:    popq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 ; Check a statepoint wrapping a *void* returning vararg function works
 entry:
   %safepoint_token = tail call token (i64, i32, void (i32, ...)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(i64 0, i32 0, void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0, i32 0)
@@ -137,15 +154,15 @@
 }
 
 define i1 @test_i1_return_patchable() gc "statepoint-example" {
-; CHECK-LABEL: test_i1_return_patchable:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    nopl (%rax)
-; CHECK-NEXT:  .Ltmp7:
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_i1_return_patchable:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    nopl (%rax)
+; CHECK-ALL-NEXT:  .Ltmp7:
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 ; A patchable variant of test_i1_return
 entry:
   %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 3, i1 ()*null, i32 0, i32 0, i32 0, i32 0)
@@ -156,38 +173,38 @@
 declare void @consume(i32 addrspace(1)* %obj)
 
 define i1 @test_cross_bb(i32 addrspace(1)* %a, i1 %external_cond) gc "statepoint-example" {
-; CHECK-LABEL: test_cross_bb:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movl %esi, %ebp
-; CHECK-NEXT:    movq %rdi, (%rsp)
-; CHECK-NEXT:    callq return_i1
-; CHECK-NEXT:  .Ltmp8:
-; CHECK-NEXT:    testb $1, %bpl
-; CHECK-NEXT:    je .LBB8_2
-; CHECK-NEXT:  # %bb.1: # %left
-; CHECK-NEXT:    movl %eax, %ebx
-; CHECK-NEXT:    movq (%rsp), %rdi
-; CHECK-NEXT:    callq consume
-; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    jmp .LBB8_3
-; CHECK-NEXT:  .LBB8_2: # %right
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:  .LBB8_3: # %right
-; CHECK-NEXT:    addq $8, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_cross_bb:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rbp
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    pushq %rbx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-ALL-NEXT:    .cfi_offset %rbx, -24
+; CHECK-ALL-NEXT:    .cfi_offset %rbp, -16
+; CHECK-ALL-NEXT:    movl %esi, %ebp
+; CHECK-ALL-NEXT:    movq %rdi, (%rsp)
+; CHECK-ALL-NEXT:    callq return_i1
+; CHECK-ALL-NEXT:  .Ltmp8:
+; CHECK-ALL-NEXT:    testb $1, %bpl
+; CHECK-ALL-NEXT:    je .LBB8_2
+; CHECK-ALL-NEXT:  # %bb.1: # %left
+; CHECK-ALL-NEXT:    movl %eax, %ebx
+; CHECK-ALL-NEXT:    movq (%rsp), %rdi
+; CHECK-ALL-NEXT:    callq consume
+; CHECK-ALL-NEXT:    movl %ebx, %eax
+; CHECK-ALL-NEXT:    jmp .LBB8_3
+; CHECK-ALL-NEXT:  .LBB8_2: # %right
+; CHECK-ALL-NEXT:    movb $1, %al
+; CHECK-ALL-NEXT:  .LBB8_3: # %right
+; CHECK-ALL-NEXT:    addq $8, %rsp
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-ALL-NEXT:    popq %rbx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    popq %rbp
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)]
   br i1 %external_cond, label %left, label %right
@@ -207,31 +224,31 @@
 declare void @consume_attributes(i32, i8* nest, i32, %struct2* byval)
 
 define void @test_attributes(%struct2* byval %s) gc "statepoint-example" {
-; CHECK-LABEL: test_attributes:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    subq $8, %rsp
-; CHECK-NEXT:    .cfi_adjust_cfa_offset 8
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    movl $42, %edi
-; CHECK-NEXT:    xorl %r10d, %r10d
-; CHECK-NEXT:    movl $17, %esi
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_adjust_cfa_offset 8
-; CHECK-NEXT:    pushq %rdx
-; CHECK-NEXT:    .cfi_adjust_cfa_offset 8
-; CHECK-NEXT:    pushq %rcx
-; CHECK-NEXT:    .cfi_adjust_cfa_offset 8
-; CHECK-NEXT:    callq consume_attributes
-; CHECK-NEXT:  .Ltmp9:
-; CHECK-NEXT:    addq $32, %rsp
-; CHECK-NEXT:    .cfi_adjust_cfa_offset -32
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test_attributes:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    subq $8, %rsp
+; CHECK-ALL-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK-ALL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-ALL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-ALL-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-ALL-NEXT:    movl $42, %edi
+; CHECK-ALL-NEXT:    xorl %r10d, %r10d
+; CHECK-ALL-NEXT:    movl $17, %esi
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK-ALL-NEXT:    pushq %rdx
+; CHECK-ALL-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK-ALL-NEXT:    pushq %rcx
+; CHECK-ALL-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK-ALL-NEXT:    callq consume_attributes
+; CHECK-ALL-NEXT:  .Ltmp9:
+; CHECK-ALL-NEXT:    addq $32, %rsp
+; CHECK-ALL-NEXT:    .cfi_adjust_cfa_offset -32
+; CHECK-ALL-NEXT:    popq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
 ; Check that arguments with attributes are lowered correctly.
 ; We call a function that has a nest argument and a byval argument.
diff --git a/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll b/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll
--- a/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll
+++ b/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc  -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc  -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-ALL %s
+; RUN: llc  -verify-machineinstrs -use-registers-for-gc-values=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s
 
 ; Check that we can export values of "duplicated" gc.relocates without a crash
 ; "duplicate" here means maps to same SDValue.  We previously had an
@@ -12,18 +13,18 @@
 declare void @func()
 
 define i1 @test() gc "statepoint-example" {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq func
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    callq func
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    popq %rcx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-ALL-LABEL: test:
+; CHECK-ALL:       # %bb.0: # %entry
+; CHECK-ALL-NEXT:    pushq %rax
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT:    callq func
+; CHECK-ALL-NEXT:  .Ltmp0:
+; CHECK-ALL-NEXT:    callq func
+; CHECK-ALL-NEXT:  .Ltmp1:
+; CHECK-ALL-NEXT:    movb $1, %al
+; CHECK-ALL-NEXT:    popq %rcx
+; CHECK-ALL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT:    retq
 entry:
   %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* null, i32 addrspace(1)* null)]
   %base = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token,  i32 0, i32 0)
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke.ll b/llvm/test/CodeGen/X86/statepoint-invoke.ll
--- a/llvm/test/CodeGen/X86/statepoint-invoke.ll
+++ b/llvm/test/CodeGen/X86/statepoint-invoke.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs -use-registers-for-gc-values=true < %s | FileCheck %s
 
 target triple = "x86_64-pc-linux-gnu"
 
diff --git a/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll b/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
--- a/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
+++ b/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -use-registers-for-gc-values=true | FileCheck --check-prefix=CHECK-VREG %s
 
 define i8 addrspace(1)* @no_extra_const(i8 addrspace(1)* %obj) gc "statepoint-example" {
 ; CHECK-LABEL:   no_extra_const:
@@ -13,6 +14,23 @@
 ; CHECK-NEXT:    popq	%rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+; CHECK-VREG-LABEL: no_extra_const:
+; CHECK-VREG:       # %bb.0: # %entry
+; CHECK-VREG-NEXT:    pushq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    subq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT:    .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT:    nopl 8(%rax)
+; CHECK-VREG-NEXT:  .Ltmp0:
+; CHECK-VREG-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT:    movq %rbx, %rax
+; CHECK-VREG-NEXT:    addq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    popq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT:    retq
 entry:
   %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 4, void ()* null, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i8 addrspace(1)* %obj)]
   %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 0, i32 0) ; (%obj, %obj)
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck --check-prefixes=CHECK,CHECK-SPILL %s
+; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -use-registers-for-gc-values=true < %s | FileCheck --check-prefixes=CHECK,CHECK-VREG %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
@@ -97,23 +98,49 @@
 
 ; A gc-value must be spilled even if it is also a deopt value.
 define  i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
-; CHECK-LABEL: test5:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    subq $16, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movl %edi, %ebx
-; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    callq _bar
-; CHECK-NEXT:  Ltmp5:
-; CHECK-NEXT:    callq _bar
-; CHECK-NEXT:  Ltmp6:
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    addq $16, %rsp
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    retq
+; CHECK-SPILL-LABEL: test5:
+; CHECK-SPILL:       ## %bb.0: ## %entry
+; CHECK-SPILL-NEXT:    pushq %rbx
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT:    subq $16, %rsp
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SPILL-NEXT:    .cfi_offset %rbx, -16
+; CHECK-SPILL-NEXT:    movl %edi, %ebx
+; CHECK-SPILL-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-SPILL-NEXT:    callq _bar
+; CHECK-SPILL-NEXT:  Ltmp5:
+; CHECK-SPILL-NEXT:    callq _bar
+; CHECK-SPILL-NEXT:  Ltmp6:
+; CHECK-SPILL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-SPILL-NEXT:    addq $16, %rsp
+; CHECK-SPILL-NEXT:    popq %rbx
+; CHECK-SPILL-NEXT:    retq
+;
+; CHECK-VREG-LABEL: test5:
+; CHECK-VREG:       ## %bb.0: ## %entry
+; CHECK-VREG-NEXT:    pushq %rbp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    pushq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT:    pushq %rax
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT:    .cfi_offset %rbx, -24
+; CHECK-VREG-NEXT:    .cfi_offset %rbp, -16
+; CHECK-VREG-NEXT:    movq %rsi, (%rsp)
+; CHECK-VREG-NEXT:    movl %edi, %ebp
+; CHECK-VREG-NEXT:    callq _bar
+; CHECK-VREG-NEXT:  Ltmp5:
+; CHECK-VREG-NEXT:    movq (%rsp), %rbx
+; CHECK-VREG-NEXT:    movq %rbx, (%rsp)
+; CHECK-VREG-NEXT:    callq _bar
+; CHECK-VREG-NEXT:  Ltmp6:
+; CHECK-VREG-NEXT:    movq (%rsp), %rbx
+; CHECK-VREG-NEXT:    movq %rbx, %rax
+; CHECK-VREG-NEXT:    addq $8, %rsp
+; CHECK-VREG-NEXT:    popq %rbx
+; CHECK-VREG-NEXT:    popq %rbp
+; CHECK-VREG-NEXT:    retq
+
 entry:
   %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %p, i32 addrspace(1)* %p), "deopt"(i32 %a)]
   %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 1, i32 1)
@@ -672,30 +699,27 @@
   ret void
 }
 
-define i32 addrspace(1)*  @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepoint-example" {
+define void @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepoint-example" {
 ; CHECK-LABEL: test_fpconst_deopt:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movq %rdi, (%rsp)
 ; CHECK-NEXT:    nopl 8(%rax,%rax)
 ; CHECK-NEXT:  Ltmp18:
-; CHECK-NEXT:    movq (%rsp), %rax
-; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
-    %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %in), "deopt" (
+    %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (
     float 0x40421A1CA0000000, float 0x40459A1CA0000000, float 0x40401A1CA0000000, float 0x40479A1CA0000000, float 0x403C343940000000,
     float 0x403E343940000000, float 0x40469A1CA0000000, float 0x40489A1CA0000000, float 0x404A9A1CA0000000, float 0x40499A1CA0000000,
     float 0xC05FCD2F20000000, float 0xC05C0D2F20000000, float 0xC060269780000000, float 0xC05B8D2F20000000, float 0xC060669780000000,
     float 0xC05B0D2F20000000, float 0xC060A69780000000, float 0xC05A8D2F20000000, float 0xC060E69780000000, float 0x40439A1CA0000000)]
-    %out = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 0, i32 0)
-    ret i32 addrspace(1)* %out
+    ret void
 }
 
 ; CHECK-LABEL: __LLVM_StackMaps:
 ; CHECK: .long   Ltmp18-_test_fpconst_deopt
 ; CHECK-NEXT: .short	0
-; CHECK-NEXT: .short	25
+; CHECK-NEXT: .short	23
 ; CHECK-NEXT: .byte	4
 ; CHECK-NEXT: .byte	0
 ; CHECK-NEXT: .short	8
diff --git a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
--- a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
+++ b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK-SPILL,CHECK %s
+; RUN: llc -verify-machineinstrs -use-registers-for-gc-values=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK %s
 
 target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-linux-gnu"
@@ -12,20 +13,41 @@
 
 ;; Two gc.relocates of the same input, should require only a single spill/fill
 define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
-; CHECK-LABEL: test_gcrelocate_uniqueing:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movq %rdi, (%rsp)
-; CHECK-NEXT:    callq f
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    movq (%rsp), %rdi
-; CHECK-NEXT:    movq %rdi, %rsi
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    callq use
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-SPILL-LABEL: test_gcrelocate_uniqueing:
+; CHECK-SPILL:       # %bb.0:
+; CHECK-SPILL-NEXT:    pushq %rax
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT:    movq %rdi, (%rsp)
+; CHECK-SPILL-NEXT:    callq f
+; CHECK-SPILL-NEXT:  .Ltmp0:
+; CHECK-SPILL-NEXT:    movq (%rsp), %rdi
+; CHECK-SPILL-NEXT:    movq %rdi, %rsi
+; CHECK-SPILL-NEXT:    xorl %eax, %eax
+; CHECK-SPILL-NEXT:    callq use
+; CHECK-SPILL-NEXT:    popq %rax
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-SPILL-NEXT:    retq
+
+; CHECK-VREG-LABEL: test_gcrelocate_uniqueing:
+; CHECK-VREG:       # %bb.0:
+; CHECK-VREG-NEXT:    pushq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    subq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT:    .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT:    movq %rdi, 8(%rsp)
+; CHECK-VREG-NEXT:    callq f
+; CHECK-VREG-NEXT:  .Ltmp0:
+; CHECK-VREG-NEXT:    movq 8(%rsp), %rbx
+; CHECK-VREG-NEXT:    movq %rbx, %rdi
+; CHECK-VREG-NEXT:    movq %rbx, %rsi
+; CHECK-VREG-NEXT:    xorl %eax, %eax
+; CHECK-VREG-NEXT:    callq use
+; CHECK-VREG-NEXT:    addq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    popq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT:    retq
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
       @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr), "deopt" (i32 addrspace(1)* %ptr, i32 undef)]
   %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 0, i32 0)
@@ -36,20 +58,41 @@
 
 ;; Two gc.relocates of a bitcasted pointer should only require a single spill/fill
 define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
-; CHECK-LABEL: test_gcptr_uniqueing:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movq %rdi, (%rsp)
-; CHECK-NEXT:    callq f
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:    movq (%rsp), %rdi
-; CHECK-NEXT:    movq %rdi, %rsi
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    callq use
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-SPILL-LABEL: test_gcptr_uniqueing:
+; CHECK-SPILL:       # %bb.0:
+; CHECK-SPILL-NEXT:    pushq %rax
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT:    movq %rdi, (%rsp)
+; CHECK-SPILL-NEXT:    callq f
+; CHECK-SPILL-NEXT:  .Ltmp1:
+; CHECK-SPILL-NEXT:    movq (%rsp), %rdi
+; CHECK-SPILL-NEXT:    movq %rdi, %rsi
+; CHECK-SPILL-NEXT:    xorl %eax, %eax
+; CHECK-SPILL-NEXT:    callq use
+; CHECK-SPILL-NEXT:    popq %rax
+; CHECK-SPILL-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-SPILL-NEXT:    retq
+
+; CHECK-VREG-LABEL: test_gcptr_uniqueing:
+; CHECK-VREG:       # %bb.0:
+; CHECK-VREG-NEXT:    pushq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    subq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT:    .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT:    movq %rdi, 8(%rsp)
+; CHECK-VREG-NEXT:    callq f
+; CHECK-VREG-NEXT:  .Ltmp1:
+; CHECK-VREG-NEXT:    movq 8(%rsp), %rbx
+; CHECK-VREG-NEXT:    movq %rbx, %rdi
+; CHECK-VREG-NEXT:    movq %rbx, %rsi
+; CHECK-VREG-NEXT:    xorl %eax, %eax
+; CHECK-VREG-NEXT:    callq use
+; CHECK-VREG-NEXT:    addq $16, %rsp
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT:    popq %rbx
+; CHECK-VREG-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT:    retq
   %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
       @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2), "deopt" (i32 addrspace(1)* %ptr, i32 undef)]