diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -99,6 +99,15 @@ using StatepointSpillMapTy = DenseMap<const Value *, Optional<int>>; DenseMap<const Instruction *, StatepointSpillMapTy> StatepointSpillMaps; + /// For each statepoint keep mapping from original derived pointer to + /// the index of Statepoint node result defining its new value. + using DerivedPtrMapTy = DenseMap<const Value *, unsigned>; + DenseMap<const Instruction *, DerivedPtrMapTy> DerivedPtrMap; + + /// For each statepoint keep the virtual registers its result values have + /// been exported to. + DenseMap<const Instruction *, SmallVector<unsigned, 2>> StatepointRegs; + /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in /// the entry block. This allows the allocas to be efficiently referenced /// anywhere in the function. diff --git a/llvm/include/llvm/CodeGen/StackMaps.h b/llvm/include/llvm/CodeGen/StackMaps.h --- a/llvm/include/llvm/CodeGen/StackMaps.h +++ b/llvm/include/llvm/CodeGen/StackMaps.h @@ -166,21 +166,23 @@ enum { CCOffset = 1, FlagsOffset = 3, NumDeoptOperandsOffset = 5 }; public: - explicit StatepointOpers(const MachineInstr *MI) : MI(MI) {} + explicit StatepointOpers(const MachineInstr *MI) : MI(MI) { + NumDefs = MI->getNumDefs(); + } /// Get index of statepoint ID operand. - unsigned getIDPos() const { return IDPos; } + unsigned getIDPos() const { return NumDefs + IDPos; } /// Get index of Num Patch Bytes operand. - unsigned getNBytesPos() const { return NBytesPos; } + unsigned getNBytesPos() const { return NumDefs + NBytesPos; } /// Get index of Num Call Arguments operand. - unsigned getNCallArgsPos() const { return NCallArgsPos; } + unsigned getNCallArgsPos() const { return NumDefs + NCallArgsPos; } /// Get starting index of non call related arguments /// (calling convention, statepoint flags, vm state and gc state). unsigned getVarIdx() const { - return MI->getOperand(NCallArgsPos).getImm() + MetaEnd; + return MI->getOperand(NumDefs + NCallArgsPos).getImm() + MetaEnd + NumDefs; } /// Get index of Calling Convention operand. @@ -195,16 +197,16 @@ } /// Return the ID for the given statepoint. - uint64_t getID() const { return MI->getOperand(IDPos).getImm(); } + uint64_t getID() const { return MI->getOperand(NumDefs + IDPos).getImm(); } /// Return the number of patchable bytes the given statepoint should emit. uint32_t getNumPatchBytes() const { - return MI->getOperand(NBytesPos).getImm(); + return MI->getOperand(NumDefs + NBytesPos).getImm(); } /// Return the target of the underlying call. const MachineOperand &getCallTarget() const { - return MI->getOperand(CallTargetPos); + return MI->getOperand(NumDefs + CallTargetPos); } /// Return the calling convention.
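The StatepointOpers change above is purely an index shift: once STATEPOINT can define results, all fixed meta-operand positions move past the defs. A minimal standalone model of that shift (simplified types and layout, not LLVM's real classes):

    #include <cassert>
    #include <vector>

    // Toy statepoint: NumDefs results are prepended to the fixed meta operands
    // <id>, <numPatchBytes>, <numCallArgs>, ...
    struct ToyStatepoint {
      unsigned NumDefs;
      std::vector<long long> Operands; // defs first, then meta operands
    };

    enum { IDPos = 0, NBytesPos = 1, NCallArgsPos = 2 };

    static long long getID(const ToyStatepoint &MI) {
      return MI.Operands[MI.NumDefs + IDPos]; // skip the leading defs
    }

    int main() {
      // One def (a relocated pointer), then ID = 42, NumPatchBytes = 0,
      // NumCallArgs = 2.
      ToyStatepoint MI{1, {7, 42, 0, 2}};
      assert(getID(MI) == 42);
      return 0;
    }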
@@ -217,6 +219,7 @@ private: const MachineInstr *MI; + unsigned NumDefs; }; class StackMaps { diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1157,7 +1157,7 @@ let usesCustomInserter = 1; } def STATEPOINT : StandardPseudoInstruction { - let OutOperandList = (outs); + let OutOperandList = (outs variable_ops); let InOperandList = (ins variable_ops); let usesCustomInserter = 1; let mayLoad = 1; diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -46,8 +46,18 @@ cl::desc("Allow spill in spill slot of greater size than register size"), cl::Hidden); +static cl::opt<bool> PassGCPtrInCSR( + "fixup-allow-gcptr-in-csr", cl::Hidden, cl::init(false), + cl::desc("Allow passing GC Pointer arguments in callee saved registers")); + +static cl::opt<unsigned> MaxStatepointsWithRegs( + "fixup-max-csr-statepoints", cl::Hidden, cl::init(0), + cl::desc("Max number of statepoints allowed to pass GC Ptrs in registers")); + namespace { +class FrameIndexesCache; + class FixupStatepointCallerSaved : public MachineFunctionPass { public: static char ID; @@ -66,7 +76,12 @@ } bool runOnMachineFunction(MachineFunction &MF) override; + +private: + void collectGlobalFIs(MachineBasicBlock &BB, FrameIndexesCache &Cache, + const TargetRegisterInfo *TRI); }; + } // End anonymous namespace. char FixupStatepointCallerSaved::ID = 0; @@ -83,6 +98,49 @@ return TRI.getSpillSize(*RC); } +// Advance iterator to the next stack map entry +static MachineInstr::const_mop_iterator +advanceToNextStackMapElt(MachineInstr::const_mop_iterator MOI) { + if (MOI->isImm()) { + switch (MOI->getImm()) { + default: + llvm_unreachable("Unrecognized operand type."); + case StackMaps::DirectMemRefOp: + MOI += 2; // <Reg>, <Offset> + break; + case StackMaps::IndirectMemRefOp: + MOI += 3; // <Size>, <Reg>, <Offset> + break; + case StackMaps::ConstantOp: + MOI += 1; + break; + } + } + return ++MOI; +} + +// Return statepoint GC args as a set +static SmallSet<Register, 8> collectGCRegs(MachineInstr &MI) { + StatepointOpers SO(&MI); + unsigned VarIdx = SO.getVarIdx(); + unsigned NumDeoptIdx = VarIdx + 5; + unsigned NumDeoptArgs = MI.getOperand(NumDeoptIdx).getImm(); + MachineInstr::const_mop_iterator MOI(MI.operands_begin() + NumDeoptIdx + 1), + MOE(MI.operands_end()); + + // Skip deopt args + for (unsigned i = 0; i < NumDeoptArgs; ++i) + MOI = advanceToNextStackMapElt(MOI); + + SmallSet<Register, 8> Result; + while (MOI != MOE) { + if (MOI->isReg() && !MOI->isImplicit()) + Result.insert(MOI->getReg()); + MOI = advanceToNextStackMapElt(MOI); + } + return Result; +} + namespace { // Cache used frame indexes during statepoint re-write to re-use them in // processing next statepoint instruction. @@ -105,6 +163,13 @@ // size will be increased. DenseMap<unsigned, FrameIndexesPerSize> Cache; + // A landing pad can be the destination of several statepoints. Every register + // defined by such statepoints must be spilled to the same stack slot. + // This map keeps that information. + // NOTE: we assume that spill slot live ranges do not intersect. + using RegStatepointPair = std::pair<Register, MachineInstr *>; + DenseMap<RegStatepointPair, int> GlobalIndices; + public: FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI) : MFI(MFI), TRI(TRI) {} @@ -114,8 +179,19 @@ for (auto &It : Cache) It.second.Index = 0; } + // Get frame index to spill the register.
- int getFrameIndex(Register Reg) { + int getFrameIndex(Register Reg, MachineInstr *MI = nullptr) { + if (MI) { + auto It = GlobalIndices.find(std::make_pair(Reg, MI)); + if (It != GlobalIndices.end()) { + int FI = It->second; + LLVM_DEBUG(dbgs() << "Found global FI " << FI << " for register " + << printReg(Reg, &TRI) << " at " << *MI); + return FI; + } + } + unsigned Size = getRegisterSize(TRI, Reg); // In FixupSCSExtendSlotSize mode the bucket with 0 index is used // for all sizes. @@ -148,8 +224,32 @@ return getRegisterSize(TRI, A) > getRegisterSize(TRI, B); }); } + + // Record frame index to be used to spill register \p Reg at instr \p MI. + void addGlobalSpillSlot(Register Reg, MachineInstr *MI, int FI) { + auto P = std::make_pair(Reg, MI); + GlobalIndices.insert(std::make_pair(P, FI)); + } }; +// Check if we have already inserted a reload of register Reg from spill slot FI +// in basic block MBB. +// This can happen in an EH pad block which is a successor of several +// statepoints. +static bool hasRegReload(Register Reg, int FI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + auto I = MBB->SkipPHIsLabelsAndDebug(MBB->begin()), E = MBB->end(); + int Dummy; + for (; I != E; ++I) { + if (TII->isLoadFromStackSlot(*I, Dummy) == Reg && Dummy == FI) + return true; + if (I->modifiesRegister(Reg, TRI) || I->readsRegister(Reg, TRI)) + return false; + } + return false; +} + // Describes the state of the current processing statepoint instruction. class StatepointState { private: @@ -163,6 +263,7 @@ const uint32_t *Mask; // Cache of frame indexes used on previous instruction processing. FrameIndexesCache &CacheFI; + bool AllowGCPtrInCSR; // Operands with physical registers requiring spilling. SmallVector<unsigned, 8> OpsToSpill; // Set of register to spill. @@ -172,17 +273,20 @@ public: StatepointState(MachineInstr &MI, const uint32_t *Mask, - FrameIndexesCache &CacheFI) + FrameIndexesCache &CacheFI, bool AllowGCPtrInCSR) : MI(MI), MF(*MI.getMF()), TRI(*MF.getSubtarget().getRegisterInfo()), TII(*MF.getSubtarget().getInstrInfo()), MFI(MF.getFrameInfo()), - Mask(Mask), CacheFI(CacheFI) {} + Mask(Mask), CacheFI(CacheFI), AllowGCPtrInCSR(AllowGCPtrInCSR) {} + // Return true if register is callee saved. bool isCalleeSaved(Register Reg) { return (Mask[Reg / 32] >> Reg % 32) & 1; } + // Iterates over statepoint meta args to find caller saved registers. // Also cache the size of found registers. // Returns true if caller save registers found. bool findRegistersToSpill() { SmallSet<Register, 8> VisitedRegs; + SmallSet<Register, 8> GCRegs = collectGCRegs(MI); for (unsigned Idx = StatepointOpers(&MI).getVarIdx(), EndIdx = MI.getNumOperands(); Idx < EndIdx; ++Idx) { @@ -191,8 +295,14 @@ continue; Register Reg = MO.getReg(); assert(Reg.isPhysical() && "Only physical regs are expected"); - if (isCalleeSaved(Reg)) + + if (isCalleeSaved(Reg) && + (AllowGCPtrInCSR || !is_contained(GCRegs, Reg))) continue; + + LLVM_DEBUG(dbgs() << "Will spill " << printReg(Reg, &TRI) << " at index " + << Idx << "\n"); + if (VisitedRegs.insert(Reg).second) RegsToSpill.push_back(Reg); OpsToSpill.push_back(Idx); @@ -200,18 +310,95 @@ CacheFI.sortRegisters(RegsToSpill); return !RegsToSpill.empty(); } + // Spill all caller saved registers right before statepoint instruction. // Remember frame index where register is spilled.
void spillRegisters() { for (Register Reg : RegsToSpill) { - int FI = CacheFI.getFrameIndex(Reg); + int FI = CacheFI.getFrameIndex(Reg, &MI); const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(*MI.getParent(), MI, Reg, true /*is_Kill*/, FI, - RC, &TRI); + NumSpilledRegisters++; RegToSlotIdx[Reg] = FI; + + LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, &TRI) << "\n"); + bool isKill = true; + MachineInstr *InsertBefore = &MI; + + // Perform trivial copy propagation + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::reverse_iterator B(MI); + MachineInstr *Def = nullptr, *Use = nullptr; + for (auto It = std::next(B); It != MBB->rend(); ++It) { + if (It->readsRegister(Reg, &TRI) && !Use) + Use = &*It; + if (It->modifiesRegister(Reg, &TRI)) { + Def = &*It; + break; + } + } + if (Def) + if (auto DestSrc = TII.isCopyInstr(*Def)) + if (DestSrc->Destination->getReg() == Reg) { + Register SrcReg = DestSrc->Source->getReg(); + LLVM_DEBUG(dbgs() << "spillRegisters: perform copy propagation " + << printReg(Reg, &TRI) << " -> " << printReg(SrcReg, &TRI) + << "\n"); + Reg = SrcReg; + isKill = DestSrc->Source->isKill(); + InsertBefore = Def->getNextNode(); + if (!Use) + Def->eraseFromParent(); + } + + LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); + TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, isKill, FI, + RC, &TRI); + } + } + + void insertReloadBefore(unsigned Reg, MachineBasicBlock::iterator It, + MachineBasicBlock *MBB) { + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + int FI = RegToSlotIdx[Reg]; + if (It != MBB->end()) { + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI); + return; } + + // To insert a reload at the end of MBB, insert it before the last + // instruction and then swap them. + assert(MBB->begin() != MBB->end() && "Empty block"); + --It; + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI); + MachineInstr *Reload = It->getPrevNode(); + int Dummy = 0; + assert(TII.isLoadFromStackSlot(*Reload, Dummy) == Reg); + assert(Dummy == FI); + MBB->remove(Reload); + MBB->insertAfter(It, Reload); } + + // Insert reloads of register Reg after it has been spilled at the statepoint. + void insertReloads(unsigned Reg) { + MachineBasicBlock *MBB = MI.getParent(); + auto It = MI.getIterator(); + insertReloadBefore(Reg, ++It, MBB); + + // An invoke statepoint must be the last instruction in the block. + if (std::any_of(It, MBB->end().getInstrIterator(), + [](MachineInstr &I) { return I.getOpcode() == TargetOpcode::STATEPOINT; })) + return; + + int FI = RegToSlotIdx[Reg]; + for (auto Succ : MBB->successors()) { + if (!Succ->isEHPad() || hasRegReload(Reg, FI, Succ, &TII, &TRI)) + continue; + auto It = Succ->SkipPHIsLabelsAndDebug(Succ->begin()); + insertReloadBefore(Reg, It, Succ); + } + } + // Re-write statepoint machine instruction to replace caller saved operands // with indirect memory location (frame index). void rewriteStatepoint() { @@ -219,11 +406,36 @@ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); + unsigned NumOps = MI.getNumOperands(); + + // Set of registers to reload after statepoint. + SmallVector<Register, 8> RegsToReload; + // New indices for the remaining defs.
+ SmallVector<unsigned, 8> NewIndices; + unsigned NumDefs = MI.getNumDefs(); + for (unsigned I = 0; I < NumDefs; ++I) { + MachineOperand &DefMO = MI.getOperand(I); + assert(DefMO.isReg() && DefMO.isDef() && "Expected Reg Def operand"); + Register Reg = DefMO.getReg(); + if (!AllowGCPtrInCSR) { + assert(is_contained(RegsToSpill, Reg)); + RegsToReload.push_back(Reg); + } else { + if (isCalleeSaved(Reg)) { + NewIndices.push_back(NewMI->getNumOperands()); + MIB.addReg(Reg, RegState::Define); + } else { + NewIndices.push_back(NumOps); + RegsToReload.push_back(Reg); + } + } + } + // Add End marker. OpsToSpill.push_back(MI.getNumOperands()); unsigned CurOpIdx = 0; - for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + for (unsigned I = NumDefs; I < MI.getNumOperands(); ++I) { MachineOperand &MO = MI.getOperand(I); if (I == OpsToSpill[CurOpIdx]) { int FI = RegToSlotIdx[MO.getReg()]; @@ -234,8 +446,15 @@ MIB.addFrameIndex(FI); MIB.addImm(0); ++CurOpIdx; - } else + } else { MIB.add(MO); + unsigned OldDef; + if (AllowGCPtrInCSR && MI.isRegTiedToDefOperand(I, &OldDef)) { + assert(OldDef < NumDefs); + assert(NewIndices[OldDef] < NumOps); + MIB->tieOperands(NewIndices[OldDef], MIB->getNumOperands() - 1); + } + } } assert(CurOpIdx == (OpsToSpill.size() - 1) && "Not all operands processed"); // Add mem operands. @@ -248,8 +467,14 @@ MFI.getObjectAlign(FrameIndex)); NewMI->addMemOperand(MF, MMO); } + // Insert new statepoint and erase old one. MI.getParent()->insert(MI, NewMI); + + for (Register Reg : RegsToReload) + insertReloads(Reg); + + LLVM_DEBUG(dbgs() << "rewritten statepoint to : " << *NewMI << "\n"); MI.eraseFromParent(); } }; @@ -265,16 +490,22 @@ : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()), CacheFI(MF.getFrameInfo(), TRI) {} - bool process(MachineInstr &MI) { + StatepointProcessor(MachineFunction &MF, FrameIndexesCache &Cache) + : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()), CacheFI(Cache) {} + + bool process(MachineInstr &MI, bool AllowGCPtrInCSR) { StatepointOpers SO(&MI); uint64_t Flags = SO.getFlags(); // Do nothing for LiveIn, it supports all registers. if (Flags & (uint64_t)StatepointFlags::DeoptLiveIn) return false; + LLVM_DEBUG(dbgs() << "\nMBB " << MI.getParent()->getNumber() << " " + << MI.getParent()->getName() << " : process statepoint " + << MI); CallingConv::ID CC = SO.getCallingConv(); const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC); CacheFI.reset(); - StatepointState SS(MI, Mask, CacheFI); + StatepointState SS(MI, Mask, CacheFI, AllowGCPtrInCSR); if (!SS.findRegistersToSpill()) return false; @@ -286,6 +517,84 @@ }; } // namespace +// Return live out definition of Reg in MBB or null +static MachineInstr *findLiveOutDef(Register Reg, MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + for (auto I = MBB->rbegin(), E = MBB->rend(); I != E; ++I) { + // Special case for statepoint because + // - register mask modifies register; + // - for patchable call with ZingICC calling convention we can have: + // STATEPOINT 1, 16, 2, undef renamable $rax, $eax, ...
implicit-def + // $eax + // Note that [re]ax is used as the call target AND the first parameter, and + // is marked as an implicit def + if (I->getOpcode() == TargetOpcode::STATEPOINT) + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + MachineOperand &MO = I->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + return nullptr; + if (MO.getReg() == Reg) + return &*I; + } + if (I->modifiesRegister(Reg, TRI)) + return &*I; + } + return nullptr; +} + +// For an EH pad block with multiple predecessors, check if its live-in +// registers are defined by statepoints in the preds. If so, assign the same +// spill slot to each register at each statepoint. +// NOTE: This works only if all reaching definitions of a register are +// statepoints; otherwise we cannot insert a reload into the EH pad and must +// insert multiple reloads on the edges. +// ASSUMPTION: live ranges of spill slots from different statepoints do not +// intersect. +void FixupStatepointCallerSaved::collectGlobalFIs( + MachineBasicBlock &BB, FrameIndexesCache &Cache, + const TargetRegisterInfo *TRI) { + // NOTE: I've seen dead registers marked as live-ins in a block. + // That's OK for us, but if it were broken the other way + // (a live register missing from the live-ins), we would be in trouble. + if (!BB.isEHPad() || BB.livein_empty() || BB.pred_size() == 1) + return; + SmallVector<MachineBasicBlock *, 8> Preds(BB.predecessors()); + auto isStatepoint = [](MachineInstr *I) { + return I && I->getOpcode() == TargetOpcode::STATEPOINT; + }; + + // Resetting the cache allows us to reuse stack slots between + // different 'statepoint sets' (a set of statepoints reaching + // the same EH Pad). This works under the assumption that we allocate + // these 'global' spill slots before starting to process + // individual statepoints. + Cache.reset(); + + for (auto &LI : BB.liveins()) { + Register Reg = LI.PhysReg; + SmallVector<MachineInstr *, 8> RegDefs; + for (auto *B : Preds) + RegDefs.push_back(findLiveOutDef(Reg, B, TRI)); + if (llvm::all_of(RegDefs, isStatepoint)) { + int FI = Cache.getFrameIndex(Reg); + for (auto *Def : RegDefs) { + Cache.addGlobalSpillSlot(Reg, Def, FI); + LLVM_DEBUG(dbgs() << "EH Pad bb." << BB.getNumber() << ": reserving FI " + << FI << " to spill register " << printReg(Reg, TRI) + << " at statepoint in bb." + << Def->getParent()->getNumber() << "\n"); + } + } else { + // The spilling here is all-or-nothing: either all defining instructions + // are statepoints (and we can spill to the same slot) or none of them are + // statepoints (so we do not need any reloads). Otherwise we're in + // trouble.
+ assert(llvm::none_of(RegDefs, isStatepoint) && + "Cannot safely reload register"); + } + } +} + bool FixupStatepointCallerSaved::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -294,18 +603,30 @@ if (!F.hasGC()) return false; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + FrameIndexesCache FICache(MF.getFrameInfo(), *TRI); + SmallVector<MachineInstr *, 16> Statepoints; - for (MachineBasicBlock &BB : MF) + for (MachineBasicBlock &BB : MF) { + collectGlobalFIs(BB, FICache, TRI); for (MachineInstr &I : BB) if (I.getOpcode() == TargetOpcode::STATEPOINT) Statepoints.push_back(&I); + } if (Statepoints.empty()) return false; bool Changed = false; - StatepointProcessor SPP(MF); - for (MachineInstr *I : Statepoints) - Changed |= SPP.process(*I); + StatepointProcessor SPP(MF, FICache); + unsigned NumStatepoints = 0; + bool AllowGCPtrInCSR = PassGCPtrInCSR; + for (MachineInstr *I : Statepoints) { + ++NumStatepoints; + if (MaxStatepointsWithRegs.getNumOccurrences() && + NumStatepoints >= MaxStatepointsWithRegs) + AllowGCPtrInCSR = false; + Changed |= SPP.process(*I, AllowGCPtrInCSR); + } return Changed; } diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -810,6 +810,8 @@ bool WasCopy = MI->isCopy(); unsigned ImpReg = 0; + bool UntieRegs = MI->getOpcode() == TargetOpcode::STATEPOINT; + // Spill subregs if the target allows it. // We always want to spill subregs for stackmap/patchpoint pseudos. bool SpillSubRegs = TII.isSubregFoldable() || @@ -829,6 +831,9 @@ continue; } + if (UntieRegs && MO.isTied()) + MI->untieRegOperand(Idx); + if (!SpillSubRegs && MO.getSubReg()) return false; // We cannot fold a load instruction into a def. diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1545,6 +1545,9 @@ if (MCID.getOpcode() == TargetOpcode::PATCHPOINT) NumDefs = (MONum == 0 && MO->isReg()) ? NumDefs : 0; + if (MCID.getOpcode() == TargetOpcode::STATEPOINT) + NumDefs = MI->getNumDefs(); + // The first MCID.NumDefs operands must be explicit register defines if (MONum < NumDefs) { const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -82,6 +82,27 @@ return N; } +/// Return the starting index of the GC operand list. +// FIXME: need a better place for this. Put it in StackMaps? +static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::STATEPOINT && + "STATEPOINT node expected"); + unsigned OperIdx = StatepointOpers(MI).getNumDeoptArgsIdx(); + unsigned NumDeopts = MI->getOperand(OperIdx).getImm(); + // At this point stack references have not been lowered yet, so each + // takes a single operand. + ++OperIdx; + for (unsigned i = 0; i < NumDeopts; ++i) { + MachineOperand &MO = MI->getOperand(OperIdx); + if (MO.isImm() && MO.getImm() == StackMaps::ConstantOp) { + ++OperIdx; + assert(MI->getOperand(OperIdx).isImm() && "Unexpected statepoint operand"); + } + ++OperIdx; + } + return OperIdx; +} + /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output.
void InstrEmitter:: @@ -200,6 +221,8 @@ bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() && II.isVariadic() && II.variadicOpsAreDefs(); unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs(); + if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT) + NumVRegs = NumResults; for (unsigned i = 0; i < NumVRegs; ++i) { // If the specific node value is only used by a CopyToReg and the dest reg // is a vreg in the same register class, use the CopyToReg'd destination @@ -821,6 +844,8 @@ NumDefs = NumResults; } ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC); + } else if (Opc == TargetOpcode::STATEPOINT) { + NumDefs = NumResults; } unsigned NumImpUses = 0; @@ -970,6 +995,17 @@ if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef()) MIB->setPhysRegsDeadExcept(UsedRegs, *TRI); + // STATEPOINT is too 'dynamic' to have a meaningful machine description. + // We have to manually tie operands. + if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) { + assert(!HasPhysRegOuts && "STATEPOINT mishandled"); + MachineInstr *MI = MIB; + unsigned GCArgsStart = getStatepointGCArgStartIdx(MI); + unsigned Use = GCArgsStart + 1; + for (unsigned Def = 0; Def < NumDefs; ++Def, Use += 2) + MI->tieOperands(Def, Use); + } + // Run post-isel target hook to adjust this instruction if needed. if (II.hasPostISelHook()) TLI->AdjustInstrPostInstrSelection(*MIB, Node); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -125,8 +125,7 @@ PhysReg = Reg; } else if (Def->isMachineOpcode()) { const MCInstrDesc &II = TII->get(Def->getMachineOpcode()); - if (ResNo >= II.getNumDefs() && - II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) + if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg)) PhysReg = Reg; } diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -66,6 +66,10 @@ "use-registers-for-deopt-values", cl::Hidden, cl::init(false), cl::desc("Allow using registers for non pointer deopt args")); +cl::opt<bool> UseRegistersForGCPointers( + "use-registers-for-gcptrs", cl::Hidden, cl::init(false), + cl::desc("Allow using registers for GC pointer meta args")); + static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops, SelectionDAGBuilder &Builder, uint64_t Value) { SDLoc L = Builder.getCurSDLoc(); @@ -220,6 +224,14 @@ return None; } +// Return true if V is a value which does not need to be relocated/spilled. +static bool isConstantVal(SDValue V) { + if (V.getValueSizeInBits() > 64) + return false; + return (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V) || + isa<FrameIndexSDNode>(V) || V.isUndef()); +} + /// Try to find existing copies of the incoming values in stack slots used for /// statepoint spilling. If we can find a spill slot for the incoming value, /// mark that slot as allocated, and reuse the same slot for this safepoint.
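The manual tying in EmitMachineNode relies on the GC section being laid out as interleaved (base, derived) pairs, so result k is tied to the derived pointer at GCArgsStart + 2*k + 1. A small sketch of just that index arithmetic (the constants are illustrative, not a real operand layout):

    #include <cassert>

    int main() {
      // Suppose the GC operand section starts at index 10 and holds pairs
      // (base0, derived0, base1, derived1, ...). Def k relocates derived k.
      unsigned GCArgsStart = 10, NumDefs = 3;
      unsigned Use = GCArgsStart + 1; // first derived pointer
      for (unsigned Def = 0; Def < NumDefs; ++Def, Use += 2)
        assert(Use == GCArgsStart + 2 * Def + 1); // tieOperands(Def, Use)
      return 0;
    }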
@@ -229,12 +241,8 @@ SelectionDAGBuilder &Builder) { SDValue Incoming = Builder.getValue(IncomingValue); - if (isa<ConstantSDNode>(Incoming) || isa<ConstantFPSDNode>(Incoming) || - isa<FrameIndexSDNode>(Incoming) || Incoming.isUndef()) { - // We won't need to spill this, so no need to check for previously - // allocated stack slots + if (isConstantVal(Incoming)) return; - } SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming); if (OldLocation.getNode()) @@ -274,6 +282,29 @@ Builder.StatepointLowering.setLocation(Incoming, Loc); } +/// Sort the Ptrs vector so that constants, allocas and undefs +/// contiguously occupy the end of the vector. +/// Synchronously update the Bases and Relocs vectors. +static unsigned sortGCPtrs(SmallVectorImpl<const Value *> &Bases, + SmallVectorImpl<const Value *> &Ptrs, + SmallVectorImpl<const GCRelocateInst *> &Relocs, + SelectionDAGBuilder &Builder) { + unsigned curPos = 0; + for (unsigned i = 0, e = Ptrs.size(); i < e; ++i) { + SDValue SDV = Builder.getValue(Ptrs[i]); + if (isConstantVal(SDV) || SDV.getValueType().getSizeInBits() > 64) { + continue; + } + if (curPos < i) { + std::swap(Bases[curPos], Bases[i]); + std::swap(Ptrs[curPos], Ptrs[i]); + std::swap(Relocs[curPos], Relocs[i]); + } + ++curPos; + } + return curPos; +} + /// Extract call from statepoint, lower it and return pointer to the /// call node. Also update NodeMap so that getValue(statepoint) will /// reference lowered call result @@ -367,7 +398,7 @@ StoreMMO); MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc)); - + Builder.StatepointLowering.setLocation(Incoming, Loc); } @@ -456,7 +487,9 @@ /// will be set to the last value spilled (if any were). static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, - SmallVectorImpl<MachineMemOperand *> &MemRefs, SelectionDAGBuilder::StatepointLoweringInfo &SI, + SmallVectorImpl<MachineMemOperand *> &MemRefs, + unsigned NumVRegGCArgs, + SelectionDAGBuilder::StatepointLoweringInfo &SI, SelectionDAGBuilder &Builder) { // Lower the deopt and gc arguments for this statepoint. Layout will be: // deopt argument length, deopt arguments.., gc arguments... @@ -513,7 +546,9 @@ }; auto requireSpillSlot = [&](const Value *V) { - return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V); + if (isGCValue(V)) + return !UseRegistersForGCPointers || V->getType()->isVectorTy(); + return !(LiveInDeopt || UseRegistersForDeoptValues); }; // Before we actually start lowering (and allocating spill slots for values), @@ -525,7 +560,7 @@ if (requireSpillSlot(V)) reservePreviousStackSlotForValue(V, Builder); } - for (unsigned i = 0; i < SI.Bases.size(); ++i) { + for (unsigned i = NumVRegGCArgs; i < SI.Bases.size(); ++i) { reservePreviousStackSlotForValue(SI.Bases[i], Builder); reservePreviousStackSlotForValue(SI.Ptrs[i], Builder); } @@ -558,16 +593,28 @@ // arrays interwoven with each (lowered) base pointer immediately followed by // it's (lowered) derived pointer. i.e // (base[0], ptr[0], base[1], ptr[1], ...) + // Lower the first `NumVRegGCArgs` base AND derived pointers through VRegs. + // In the future we might use a more sophisticated strategy for choosing + // which pointers to pass via virtual registers, but for now this simple + // approach looks good enough. Take into account these facts: + // - NumVRegGCArgs is limited by the max number of tied registers in MI; + // - We relocate (and so need tied defs for) only derived pointers; + // - Quite often the base and derived pointers are the same.
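sortGCPtrs above is a plain swap-based partition. The same idea in self-contained form (plain ints stand in for SDValues; negative values play the role of constants/allocas/undefs that need no relocation):

    #include <cassert>
    #include <utility>
    #include <vector>

    // Move elements needing relocation to the front and everything
    // constant-like to the tail; return how many front elements are
    // virtual-register candidates. Mirrors the loop structure of sortGCPtrs.
    static unsigned partitionGCPtrs(std::vector<int> &Ptrs) {
      unsigned CurPos = 0;
      for (unsigned I = 0, E = Ptrs.size(); I < E; ++I) {
        if (Ptrs[I] < 0) // "constant-like": no relocation needed
          continue;
        if (CurPos < I)
          std::swap(Ptrs[CurPos], Ptrs[I]);
        ++CurPos;
      }
      return CurPos;
    }

    int main() {
      std::vector<int> Ptrs = {7, -1, 3, -2, 9};
      unsigned N = partitionGCPtrs(Ptrs);
      assert(N == 3 && Ptrs[0] == 7 && Ptrs[1] == 3 && Ptrs[2] == 9);
      return 0;
    }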
+ auto &SL = Builder.StatepointLowering; for (unsigned i = 0; i < SI.Bases.size(); ++i) { + bool RequireSpillSlot = (i >= NumVRegGCArgs); const Value *Base = SI.Bases[i]; lowerIncomingStatepointValue(Builder.getValue(Base), - /*RequireSpillSlot*/ true, Ops, MemRefs, + RequireSpillSlot, Ops, MemRefs, Builder); const Value *Ptr = SI.Ptrs[i]; - lowerIncomingStatepointValue(Builder.getValue(Ptr), - /*RequireSpillSlot*/ true, Ops, MemRefs, + SDValue SDV = Builder.getValue(Ptr); + lowerIncomingStatepointValue(SDV, + RequireSpillSlot, Ops, MemRefs, Builder); + if (!RequireSpillSlot && !SL.getLocation(SDV)) + SL.setLocation(SDV, Builder.DAG.getConstant(i, SDLoc(), MVT::i64)); } // If there are any explicit spill slots passed to the statepoint, record @@ -595,6 +642,7 @@ // values, while previous loops account only values with unique SDValues. const Instruction *StatepointInstr = SI.StatepointInstr; auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr]; + auto &DPtrMap = Builder.FuncInfo.DerivedPtrMap[StatepointInstr]; for (const GCRelocateInst *Relocate : SI.GCRelocates) { const Value *V = Relocate->getDerivedPtr(); @@ -602,7 +650,12 @@ SDValue Loc = Builder.StatepointLowering.getLocation(SDV); if (Loc.getNode()) { - SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex(); + if (auto FI = dyn_cast<FrameIndexSDNode>(Loc)) { + SpillMap[V] = FI->getIndex(); + } else { + DPtrMap[V] = cast<ConstantSDNode>(Loc)->getZExtValue(); + SpillMap[V] = None; + } } else { // Record value as visited, but not spilled. This is case for allocas // and constants. For this values we can avoid emitting spill load while @@ -642,10 +695,26 @@ StatepointLowering.scheduleRelocCall(*Reloc); #endif + unsigned NumVRegs = 0; + + if (UseRegistersForGCPointers) { + const unsigned MaxTiedRegs = 15U; + + // Sort vectors so that elements which need relocation are laid out + // contiguously at the beginning of the vectors. + // This is dictated by the SDNode implementation: due to a size limit, one + // cannot put a vector into a class derived from SDNode. So we map the N + // results of the Statepoint node to the first N derived pointers. + NumVRegs = sortGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this); + NumVRegs = std::min(NumVRegs, MaxTiedRegs); + } + + LLVM_DEBUG(dbgs() << "NumVRegs = " << NumVRegs << "\n"); + // Lower statepoint vmstate and gcstate arguments SmallVector<SDValue, 10> LoweredMetaArgs; SmallVector<MachineMemOperand*, 16> MemRefs; - lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this); + lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, NumVRegs, SI, *this); // Now that we've emitted the spills, we need to update the root so that the // call sequence is ordered correctly. @@ -757,10 +826,13 @@ if (Glue.getNode()) Ops.push_back(Glue); - // Compute return values. Provide a glue output since we consume one as - // input. This allows someone else to chain off us as needed.
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<EVT, 8> NodeTys; + for (unsigned i = 0; i < NumVRegs; ++i) + NodeTys.push_back(getValue(SI.Ptrs[i]).getValueType()); + NodeTys.push_back(MVT::Other); + NodeTys.push_back(MVT::Glue); + unsigned NumResults = NodeTys.size(); MachineSDNode *StatepointMCNode = DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops); DAG.setNodeMemRefs(StatepointMCNode, MemRefs); @@ -775,7 +847,7 @@ SmallVector<SDValue, 8> TEOps; // Add chain - TEOps.push_back(SDValue(StatepointMCNode, 0)); + TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2)); // Add GC transition arguments for (const Value *V : SI.GCTransitionArgs) { @@ -785,7 +857,7 @@ } // Add glue - TEOps.push_back(SDValue(StatepointMCNode, 1)); + TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1)); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -796,7 +868,12 @@ } // Replace original call - DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root + // Call: ch,glue = CALL ... + // Statepoint: [gc relocates],ch,glue = STATEPOINT ... + unsigned NumSinkValues = SinkNode->getNumValues(); + SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2), + SDValue(SinkNode, NumSinkValues - 1)}; + DAG.ReplaceAllUsesWith(CallNode, StatepointValues); // Remove original call node DAG.DeleteNode(CallNode); @@ -809,7 +886,11 @@ // previously emitted STATEPOINT value. Unfortunately, this doesn't appear // to actually be possible today. - return ReturnVal; + // SDValue must have a type to be used as a MERGE_VALUES operand. Use a void + // UNDEF as a placeholder for void functions. + if (!ReturnVal) + ReturnVal = DAG.getUNDEF(MVT::isVoid); + return DAG.getMergeValues({ReturnVal, SDValue(StatepointMCNode, 0)}, getCurSDLoc()); } void @@ -879,21 +960,48 @@ SI.NumPatchBytes = I.getNumPatchBytes(); SI.EHPadBB = EHPadBB; - SDValue ReturnValue = LowerAsSTATEPOINT(SI); + SDValue Merge = LowerAsSTATEPOINT(SI); + assert(Merge->getOpcode() == ISD::MERGE_VALUES); // Export the result value if needed + const BasicBlock *BB = I.getParent(); + std::vector<const GCRelocateInst *> RV = I.getGCRelocates(); + bool NeedExport = llvm::any_of( + RV, [&BB](const GCRelocateInst *R) { return R->getParent() != BB; }); + + // If any of the relocates or the result value will be used in a different + // basic block, we need to export them manually. The default exporting + // mechanism will not work here because it is based on IR Value types, and + // the IR statepoint has a different type than the actual call or relocates. + // It means that by default llvm will create an export register of the wrong + // type (always i32 - TokenTy - in our case). So instead we need to create + // export registers manually. + // TODO: To eliminate this problem we can remove gc.result/gc.relocate + // intrinsics completely and make the statepoint call return a tuple.
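A tiny model of the manual export described above (names and types are illustrative, not LLVM's): the statepoint records one virtual register per relocated pointer, and a gc.relocate in another block finds its register by result index:

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // statepoint -> vregs holding its relocated pointers, in result order;
      // stands in for FuncInfo.StatepointRegs.
      std::map<std::string, std::vector<unsigned>> StatepointRegs;
      // At the statepoint: copy results 0 and 1 into fresh vregs 5 and 6
      // (CopyToReg pushed onto PendingExports in the patch).
      StatepointRegs["sp1"] = {5, 6};
      // At a gc.relocate in another block: read the vreg back (CopyFromReg).
      unsigned Index = 1;
      assert(StatepointRegs["sp1"][Index] == 6);
      return 0;
    }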
+ setValue(&I, Merge); + if (NeedExport) { + LLVMContext *Context = DAG.getContext(); + SDNode *STV = Merge->getOperand(1).getNode(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + for (unsigned i = 0, e = STV->getNumValues() - 2; i < e; ++i) { + Value *DerivedPtr = SI.GCRelocates[i]->getDerivedPtr(); + SDValue Res(STV, i); + Type *Ty = DerivedPtr->getType(); + unsigned Reg = FuncInfo.CreateRegs(Ty); + RegsForValue RFV(*Context, TLI, DL, Reg, Ty, None); + SDValue Chain = DAG.getEntryNode(); + + RFV.getCopyToRegs(Res, DAG, getCurSDLoc(), Chain, nullptr, DerivedPtr); + PendingExports.push_back(Chain); + FuncInfo.StatepointRegs[SI.StatepointInstr].push_back(Reg); + } + } const GCResultInst *GCResult = I.getGCResult(); Type *RetTy = I.getActualReturnType(); if (!RetTy->isVoidTy() && GCResult) { if (GCResult->getParent() != I.getParent()) { - // Result value will be used in a different basic block so we need to - // export it now. Default exporting mechanism will not work here because - // statepoint call has a different type than the actual call. It means - // that by default llvm will create export register of the wrong type - // (always i32 in our case). So instead we need to create export register - // with correct type manually. - // TODO: To eliminate this problem we can remove gc.result intrinsics - // completely and make statepoint call to return a tuple. + SDValue ReturnValue = Merge->getOperand(0); unsigned Reg = FuncInfo.CreateRegs(RetTy); RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Reg, RetTy, @@ -903,16 +1011,7 @@ RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr); PendingExports.push_back(Chain); FuncInfo.ValueMap[&I] = Reg; - } else { - // Result value will be used in a same basic block. Don't export it or - // perform any explicit register copies. - // We'll replace the actuall call node shortly. gc_result will grab - // this value. - setValue(&I, ReturnValue); } - } else { - // The token value is never used from here on, just generate a poison value - setValue(&I, DAG.getIntPtrConstant(-1, getCurSDLoc())); } } @@ -943,7 +1042,9 @@ // NB! The GC arguments are deliberately left empty. - if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) { + auto Ret = LowerAsSTATEPOINT(SI); + assert(Ret->getOpcode() == ISD::MERGE_VALUES); + if (SDValue ReturnVal = Ret.getOperand(0)) { ReturnVal = lowerRangeToAssertZExt(DAG, *Call, ReturnVal); setValue(Call, ReturnVal); } @@ -974,17 +1075,21 @@ assert(CopyFromReg.getNode()); setValue(&CI, CopyFromReg); } else { - setValue(&CI, getValue(I)); + SDValue SD = getValue(I); + if (SD->getOpcode() == ISD::MERGE_VALUES) + SD = SD->getOperand(0); + setValue(&CI, SD); } } void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { + const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent(); #ifndef NDEBUG // Consistency check // We skip this check for relocates not in the same basic block as their // statepoint. It would be too expensive to preserve validation info through // different basic blocks. 
- if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) + if (StatepointBB == Relocate.getParent()) StatepointLowering.relocCallVisited(Relocate); auto *Ty = Relocate.getType()->getScalarType(); @@ -1007,6 +1112,35 @@ assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value"); Optional<int> DerivedPtrLocation = SlotIt->second; + auto &DPtrMap = FuncInfo.DerivedPtrMap[Relocate.getStatepoint()]; + auto It = DPtrMap.find(Relocate.getDerivedPtr()); + if (It != DPtrMap.end()) { + // This GC ptr is lowered through a VReg. + unsigned Index = It->second; + SDValue Result; + auto &StatepointRegs = FuncInfo.StatepointRegs[Relocate.getStatepoint()]; + if (StatepointBB != Relocate.getParent()) { + // Statepoint is in a different basic block. The default getValue() + // mechanism does not work here, so we need to create CopyFromRegs + // manually. See the comment in LowerStatepoint for details. + assert(Index < StatepointRegs.size()); + unsigned InReg = StatepointRegs[Index]; + RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), + DAG.getDataLayout(), InReg, DerivedPtr->getType(), + None); // This is not an ABI copy. + SDValue Chain = DAG.getEntryNode(); + Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, + DerivedPtr); + } else { + SDNode *Statepoint = getValue(Relocate.getStatepoint()).getNode(); + if (Statepoint->getOpcode() == ISD::MERGE_VALUES) + Statepoint = Statepoint->getOperand(1).getNode(); + Result = SDValue(Statepoint, Index); + } + setValue(&Relocate, Result); + return; + } + // We didn't need to spill these special cases (constants and allocas). // See the handling in spillIncomingValueForStatepoint for detail. if (!DerivedPtrLocation) { diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -471,6 +471,7 @@ ArrayRef<unsigned> Ops, int FrameIndex, const TargetInstrInfo &TII) { unsigned StartIdx = 0; + unsigned NumDefs = 0; switch (MI.getOpcode()) { case TargetOpcode::STACKMAP: { // StackMapLiveValues are foldable @@ -486,16 +487,28 @@ case TargetOpcode::STATEPOINT: { // For statepoints, fold deopt and gc arguments, but not call arguments. StartIdx = StatepointOpers(&MI).getVarIdx(); + NumDefs = MI.getNumDefs(); break; } default: llvm_unreachable("unexpected stackmap opcode"); } + unsigned DefToFoldIdx = MI.getNumOperands(); + // Return false if any operands requested for folding are not foldable (not // part of the stackmap's live values). for (unsigned Op : Ops) { - if (Op < StartIdx) + if (Op < NumDefs) + DefToFoldIdx = Op; + else if (Op < StartIdx) + return nullptr; + // When called from regalloc (InlineSpiller), operands must be untied, + // and regalloc will take care of (re)loading the operand from memory. + // But when called from other places (e.g. the peephole pass), + // we cannot fold operands which are tied - callers are unaware they + // need to reload the destination register.
+ if (MI.getOperand(Op).isTied()) return nullptr; } @@ -505,11 +518,16 @@ // No need to fold return, the meta data, and function arguments for (unsigned i = 0; i < StartIdx; ++i) - MIB.add(MI.getOperand(i)); + if (i != DefToFoldIdx) + MIB.add(MI.getOperand(i)); - for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) { + for (unsigned i = StartIdx, e = MI.getNumOperands(); i < e; ++i) { MachineOperand &MO = MI.getOperand(i); + unsigned TiedTo = e; + (void)MI.isRegTiedToDefOperand(i, &TiedTo); + if (is_contained(Ops, i)) { + assert(TiedTo == e && "Cannot fold tied operands"); unsigned SpillSize; unsigned SpillOffset; // Compute the spill slot size and offset. @@ -523,9 +541,15 @@ MIB.addImm(SpillSize); MIB.addFrameIndex(FrameIndex); MIB.addImm(SpillOffset); - } - else + } else { MIB.add(MO); + if (TiedTo < e) { + assert(TiedTo < NumDefs && "Bad tied operand"); + if (TiedTo > DefToFoldIdx) + --TiedTo; + NewMI->tieOperands(TiedTo, NewMI->getNumOperands() - 1); + } + } } return NewMI; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1041,9 +1041,15 @@ // Inherit previous memory operands. MIB.cloneMemRefs(*MI); - for (auto &MO : MI->operands()) { + for (unsigned i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &MO = MI->getOperand(i); if (!MO.isFI()) { + unsigned TiedTo = i; + if (MO.isReg() && MO.isTied()) + TiedTo = MI->findTiedOperandIdx(i); MIB.add(MO); + if (TiedTo < i) + MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1); continue; } diff --git a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll --- a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-ALL %s +; RUN: llc -verify-machineinstrs -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s ; This file contains a collection of basic tests to ensure we didn't ; screw up normal call lowering when there are no deopt or gc arguments. @@ -16,15 +17,15 @@ declare void @varargf(i32, ...) 
define i1 @test_i1_return() gc "statepoint-example" { -; CHECK-LABEL: test_i1_return: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq return_i1 -; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_i1_return: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq return_i1 +; CHECK-ALL-NEXT: .Ltmp0: +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq ; This is just checking that a i1 gets lowered normally when there's no extra ; state arguments to the statepoint entry: @@ -34,15 +35,15 @@ } define i32 @test_i32_return() gc "statepoint-example" { -; CHECK-LABEL: test_i32_return: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq return_i32 -; CHECK-NEXT: .Ltmp1: -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_i32_return: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq return_i32 +; CHECK-ALL-NEXT: .Ltmp1: +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* @return_i32, i32 0, i32 0, i32 0, i32 0) %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) @@ -50,15 +51,15 @@ } define i32* @test_i32ptr_return() gc "statepoint-example" { -; CHECK-LABEL: test_i32ptr_return: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq return_i32ptr -; CHECK-NEXT: .Ltmp2: -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_i32ptr_return: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq return_i32ptr +; CHECK-ALL-NEXT: .Ltmp2: +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, i32* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p0i32f(i64 0, i32 0, i32* ()* @return_i32ptr, i32 0, i32 0, i32 0, i32 0) %call1 = call i32* @llvm.experimental.gc.result.p0i32(token %safepoint_token) @@ -66,15 +67,15 @@ } define float @test_float_return() gc "statepoint-example" { -; CHECK-LABEL: test_float_return: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq return_float -; CHECK-NEXT: .Ltmp3: -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_float_return: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq return_float +; CHECK-ALL-NEXT: .Ltmp3: +; CHECK-ALL-NEXT: popq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, float ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_f32f(i64 0, i32 0, float ()* @return_float, i32 0, i32 0, i32 0, i32 0) %call1 = call float @llvm.experimental.gc.result.f32(token %safepoint_token) @@ -82,15 +83,15 @@ } define %struct @test_struct_return() gc "statepoint-example" { -; CHECK-LABEL: test_struct_return: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq return_struct -; CHECK-NEXT: .Ltmp4: -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_struct_return: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq return_struct +; CHECK-ALL-NEXT: .Ltmp4: +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, %struct ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_structf(i64 0, i32 0, %struct ()* @return_struct, i32 0, i32 0, i32 0, i32 0) %call1 = call %struct @llvm.experimental.gc.result.struct(token %safepoint_token) @@ -108,6 +109,22 @@ ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; CHECK-VREG-LABEL: test_relocate: +; CHECK-VREG: # %bb.0: # %entry +; CHECK-VREG-NEXT: pushq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: subq $16, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32 +; CHECK-VREG-NEXT: .cfi_offset %rbx, -16 +; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) +; CHECK-VREG-NEXT: callq return_i1 +; CHECK-VREG-NEXT: .Ltmp5: +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx +; CHECK-VREG-NEXT: addq $16, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: popq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8 +; CHECK-VREG-NEXT: retq ; Check that an ununsed relocate has no code-generation impact entry: %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] @@ -117,17 +134,17 @@ } define void @test_void_vararg() gc "statepoint-example" { -; CHECK-LABEL: test_void_vararg: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movl $42, %edi -; CHECK-NEXT: movl $43, %esi -; CHECK-NEXT: callq varargf -; CHECK-NEXT: .Ltmp6: -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_void_vararg: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: movl $42, %edi +; CHECK-ALL-NEXT: movl $43, %esi +; CHECK-ALL-NEXT: callq varargf +; CHECK-ALL-NEXT: .Ltmp6: +; CHECK-ALL-NEXT: popq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq ; Check a statepoint wrapping a *void* returning vararg function works entry: %safepoint_token = tail call token (i64, i32, void (i32, ...)*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(i64 0, i32 0, void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0, i32 0) @@ -137,15 +154,15 @@ } define i1 @test_i1_return_patchable() gc "statepoint-example" { -; CHECK-LABEL: test_i1_return_patchable: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: nopl (%rax) -; CHECK-NEXT: .Ltmp7: -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_i1_return_patchable: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: nopl (%rax) +; CHECK-ALL-NEXT: .Ltmp7: +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq ; A patchable variant of test_i1_return entry: %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 3, i1 ()*null, i32 0, i32 0, i32 0, i32 0) @@ -188,6 +205,44 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; CHECK-VREG-LABEL: test_cross_bb: +; CHECK-VREG: # %bb.0: # %entry +; CHECK-VREG-NEXT: pushq %rbp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: pushq %r14 +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24 +; CHECK-VREG-NEXT: pushq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32 +; CHECK-VREG-NEXT: subq $16, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 48 +; CHECK-VREG-NEXT: .cfi_offset %rbx, -32 +; CHECK-VREG-NEXT: .cfi_offset %r14, -24 +; CHECK-VREG-NEXT: .cfi_offset %rbp, -16 +; CHECK-VREG-NEXT: movl %esi, %ebp +; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) +; CHECK-VREG-NEXT: callq return_i1 +; CHECK-VREG-NEXT: .Ltmp8: +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx +; CHECK-VREG-NEXT: testb $1, %bpl +; CHECK-VREG-NEXT: je .LBB8_2 +; CHECK-VREG-NEXT: # %bb.1: # %left +; CHECK-VREG-NEXT: movl %eax, %r14d +; CHECK-VREG-NEXT: movq %rbx, %rdi +; CHECK-VREG-NEXT: callq consume +; CHECK-VREG-NEXT: movl %r14d, %eax +; CHECK-VREG-NEXT: jmp .LBB8_3 +; CHECK-VREG-NEXT: .LBB8_2: # %right +; CHECK-VREG-NEXT: movb $1, %al +; CHECK-VREG-NEXT: .LBB8_3: # %right +; CHECK-VREG-NEXT: addq $16, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32 +; CHECK-VREG-NEXT: popq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24 +; CHECK-VREG-NEXT: popq %r14 +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: popq %rbp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8 +; CHECK-VREG-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" (i32 addrspace(1)* %a)] br i1 %external_cond, label %left, label %right @@ -207,31 +262,31 @@ declare void @consume_attributes(i32, i8* nest, i32, %struct2* byval) define void @test_attributes(%struct2* byval %s) gc "statepoint-example" { -; CHECK-LABEL: test_attributes: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: subq $8, %rsp -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movl $42, %edi -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: movl $17, %esi -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: pushq %rdx -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: pushq %rcx -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: callq consume_attributes -; CHECK-NEXT: .Ltmp9: -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_adjust_cfa_offset -32 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test_attributes: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: subq $8, %rsp +; CHECK-ALL-NEXT: .cfi_adjust_cfa_offset 8 +; CHECK-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-ALL-NEXT: movl $42, %edi +; CHECK-ALL-NEXT: xorl %r10d, %r10d +; CHECK-ALL-NEXT: movl $17, %esi +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_adjust_cfa_offset 8 +; CHECK-ALL-NEXT: pushq %rdx +; CHECK-ALL-NEXT: .cfi_adjust_cfa_offset 8 +; CHECK-ALL-NEXT: pushq %rcx +; CHECK-ALL-NEXT: .cfi_adjust_cfa_offset 8 +; CHECK-ALL-NEXT: callq consume_attributes +; CHECK-ALL-NEXT: .Ltmp9: +; CHECK-ALL-NEXT: addq $32, %rsp +; CHECK-ALL-NEXT: .cfi_adjust_cfa_offset -32 +; CHECK-ALL-NEXT: popq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: ; Check that arguments with attributes are lowered correctly. ; We call a function that has a nest argument and a byval argument. diff --git a/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll b/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll --- a/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll +++ b/llvm/test/CodeGen/X86/statepoint-duplicates-export.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-ALL %s +; RUN: llc -verify-machineinstrs -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s ; Check that we can export values of "duplicated" gc.relocates without a crash ; "duplicate" here means maps to same SDValue. 
We previously had an @@ -12,18 +13,18 @@ declare void @func() define i1 @test() gc "statepoint-example" { -; CHECK-LABEL: test: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq func -; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: callq func -; CHECK-NEXT: .Ltmp1: -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-ALL-LABEL: test: +; CHECK-ALL: # %bb.0: # %entry +; CHECK-ALL-NEXT: pushq %rax +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-ALL-NEXT: callq func +; CHECK-ALL-NEXT: .Ltmp0: +; CHECK-ALL-NEXT: callq func +; CHECK-ALL-NEXT: .Ltmp1: +; CHECK-ALL-NEXT: movb $1, %al +; CHECK-ALL-NEXT: popq %rcx +; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8 +; CHECK-ALL-NEXT: retq entry: %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* null, i32 addrspace(1)* null) %base = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7) diff --git a/llvm/test/CodeGen/X86/statepoint-invoke.ll b/llvm/test/CodeGen/X86/statepoint-invoke.ll --- a/llvm/test/CodeGen/X86/statepoint-invoke.ll +++ b/llvm/test/CodeGen/X86/statepoint-invoke.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-ALL %s +; RUN: llc -verify-machineinstrs -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s target triple = "x86_64-pc-linux-gnu" @@ -31,6 +32,41 @@ ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; CHECK-VREG-LABEL: test_basic: +; CHECK-VREG: # %bb.0: # %entry +; CHECK-VREG-NEXT: pushq %r14 +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: pushq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24 +; CHECK-VREG-NEXT: subq $24, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 48 +; CHECK-VREG-NEXT: .cfi_offset %rbx, -24 +; CHECK-VREG-NEXT: .cfi_offset %r14, -16 +; CHECK-VREG-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) +; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) +; CHECK-VREG-NEXT: .Ltmp0: +; CHECK-VREG-NEXT: callq some_call +; CHECK-VREG-NEXT: .Ltmp3: +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 +; CHECK-VREG-NEXT: .Ltmp1: +; CHECK-VREG-NEXT: # %bb.1: # %normal_return +; CHECK-VREG-NEXT: movq %rbx, %rax +; CHECK-VREG-NEXT: .LBB0_2: # %normal_return +; CHECK-VREG-NEXT: addq $24, %rsp +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24 +; CHECK-VREG-NEXT: popq %rbx +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16 +; CHECK-VREG-NEXT: popq %r14 +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8 +; CHECK-VREG-NEXT: retq +; CHECK-VREG-NEXT: .LBB0_3: # %exceptional_return +; CHECK-VREG-NEXT: .cfi_def_cfa_offset 48 +; CHECK-VREG-NEXT: .Ltmp2: +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx +; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 +; CHECK-VREG-NEXT: movq %r14, %rax +; CHECK-VREG-NEXT: jmp .LBB0_2 i64 addrspace(1)* %obj1) gc "statepoint-example" personality i32 ()* @"personality_function" { entry: @@ -52,11 +88,11 @@ %obj1.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 1, i32 1) ret i64 addrspace(1)* %obj1.relocated1 } -; CHECK-LABEL: 
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke.ll b/llvm/test/CodeGen/X86/statepoint-invoke.ll
--- a/llvm/test/CodeGen/X86/statepoint-invoke.ll
+++ b/llvm/test/CodeGen/X86/statepoint-invoke.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-ALL %s
+; RUN: llc -verify-machineinstrs -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK-ALL %s
 
 target triple = "x86_64-pc-linux-gnu"
 
@@ -31,6 +32,41 @@
 ; CHECK-NEXT: addq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
+; CHECK-VREG-LABEL: test_basic:
+; CHECK-VREG: # %bb.0: # %entry
+; CHECK-VREG-NEXT: pushq %r14
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT: subq $24, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 48
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -24
+; CHECK-VREG-NEXT: .cfi_offset %r14, -16
+; CHECK-VREG-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: .Ltmp0:
+; CHECK-VREG-NEXT: callq some_call
+; CHECK-VREG-NEXT: .Ltmp3:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14
+; CHECK-VREG-NEXT: .Ltmp1:
+; CHECK-VREG-NEXT: # %bb.1: # %normal_return
+; CHECK-VREG-NEXT: movq %rbx, %rax
+; CHECK-VREG-NEXT: .LBB0_2: # %normal_return
+; CHECK-VREG-NEXT: addq $24, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %r14
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
+; CHECK-VREG-NEXT: .LBB0_3: # %exceptional_return
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 48
+; CHECK-VREG-NEXT: .Ltmp2:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14
+; CHECK-VREG-NEXT: movq %r14, %rax
+; CHECK-VREG-NEXT: jmp .LBB0_2
   i64 addrspace(1)* %obj1)
   gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
@@ -52,11 +88,11 @@
   %obj1.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 1, i32 1)
   ret i64 addrspace(1)* %obj1.relocated1
 }
-; CHECK-LABEL: GCC_except_table{{[0-9]+}}:
-; CHECK: .uleb128 .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
-; CHECK: .uleb128 .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
-; CHECK: .byte 0
-; CHECK: .p2align 4
+; CHECK-ALL-LABEL: GCC_except_table{{[0-9]+}}:
+; CHECK-ALL: .uleb128 .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
+; CHECK-ALL: .uleb128 .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
+; CHECK-ALL: .byte 0
+; CHECK-ALL: .p2align 4
 
 define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj,
 ; CHECK-LABEL: test_result:
@@ -79,6 +115,31 @@
 ; CHECK-NEXT: popq %rcx
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
+; CHECK-VREG-LABEL: test_result:
+; CHECK-VREG: # %bb.0: # %entry
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: subq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: .Ltmp4:
+; CHECK-VREG-NEXT: callq some_other_call
+; CHECK-VREG-NEXT: .Ltmp7:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: .Ltmp5:
+; CHECK-VREG-NEXT: .LBB1_1: # %normal_return
+; CHECK-VREG-NEXT: addq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
+; CHECK-VREG-NEXT: .LBB1_2: # %exceptional_return
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .Ltmp6:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq %rbx, %rax
+; CHECK-VREG-NEXT: jmp .LBB1_1
   i64 addrspace(1)* %obj1)
   gc "statepoint-example" personality i32 ()* @personality_function {
 entry:
@@ -95,11 +156,11 @@
   %obj.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 0, i32 0)
   ret i64 addrspace(1)* %obj.relocated
 }
-; CHECK-LABEL: GCC_except_table{{[0-9]+}}:
-; CHECK: .uleb128 .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
-; CHECK: .uleb128 .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
-; CHECK: .byte 0
-; CHECK: .p2align 4
+; CHECK-ALL-LABEL: GCC_except_table{{[0-9]+}}:
+; CHECK-ALL: .uleb128 .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
+; CHECK-ALL: .uleb128 .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
+; CHECK-ALL: .byte 0
+; CHECK-ALL: .p2align 4
 
 define i64 addrspace(1)* @test_same_val(i1 %cond, i64 addrspace(1)* %val1, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
 ; CHECK-LABEL: test_same_val:
@@ -153,6 +214,80 @@
 ; CHECK-NEXT: .Ltmp13:
 ; CHECK-NEXT: movq (%rsp), %rax
 ; CHECK-NEXT: jmp .LBB2_6
+; CHECK-VREG-LABEL: test_same_val:
+; CHECK-VREG: # %bb.0: # %entry
+; CHECK-VREG-NEXT: pushq %rbp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: pushq %r15
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT: pushq %r14
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 40
+; CHECK-VREG-NEXT: subq $24, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 64
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -40
+; CHECK-VREG-NEXT: .cfi_offset %r14, -32
+; CHECK-VREG-NEXT: .cfi_offset %r15, -24
+; CHECK-VREG-NEXT: .cfi_offset %rbp, -16
+; CHECK-VREG-NEXT: movq %rdx, %rbx
+; CHECK-VREG-NEXT: movq %rsi, %rbp
+; CHECK-VREG-NEXT: movl %edi, %r14d
+; CHECK-VREG-NEXT: testb $1, %r14b
+; CHECK-VREG-NEXT: je .LBB2_2
+; CHECK-VREG-NEXT: # %bb.1: # %left
+; CHECK-VREG-NEXT: .Ltmp11:
+; CHECK-VREG-NEXT: movq %rbp, %rdi
+; CHECK-VREG-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: callq some_call
+; CHECK-VREG-NEXT: .Ltmp14:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: .Ltmp12:
+; CHECK-VREG-NEXT: jmp .LBB2_4
+; CHECK-VREG-NEXT: .LBB2_2: # %right
+; CHECK-VREG-NEXT: movq %rcx, %r15
+; CHECK-VREG-NEXT: .Ltmp8:
+; CHECK-VREG-NEXT: movq %rbp, %rdi
+; CHECK-VREG-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: callq some_call
+; CHECK-VREG-NEXT: .Ltmp15:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15
+; CHECK-VREG-NEXT: .Ltmp9:
+; CHECK-VREG-NEXT: # %bb.3: # %right.relocs
+; CHECK-VREG-NEXT: movq %r15, %rbp
+; CHECK-VREG-NEXT: .LBB2_4: # %normal_return
+; CHECK-VREG-NEXT: testb $1, %r14b
+; CHECK-VREG-NEXT: cmoveq %rbx, %rbp
+; CHECK-VREG-NEXT: .LBB2_5: # %normal_return
+; CHECK-VREG-NEXT: movq %rbp, %rax
+; CHECK-VREG-NEXT: .LBB2_6: # %normal_return
+; CHECK-VREG-NEXT: addq $24, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 40
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: popq %r14
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT: popq %r15
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %rbp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
+; CHECK-VREG-NEXT: .LBB2_8: # %exceptional_return.right
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 64
+; CHECK-VREG-NEXT: .Ltmp10:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15
+; CHECK-VREG-NEXT: movq %rbx, %rax
+; CHECK-VREG-NEXT: jmp .LBB2_6
+; CHECK-VREG-NEXT: .LBB2_7: # %exceptional_return.left
+; CHECK-VREG-NEXT: .Ltmp13:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: jmp .LBB2_5
   gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
   br i1 %cond, label %left, label %right
@@ -195,23 +330,23 @@
 }
 
 define i64 addrspace(1)* @test_null_undef(i64 addrspace(1)* %val1)
-; CHECK-LABEL: test_null_undef:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Ltmp16:
-; CHECK-NEXT: callq some_call
-; CHECK-NEXT: .Ltmp19:
-; CHECK-NEXT: .Ltmp17:
-; CHECK-NEXT: .LBB3_1: # %normal_return
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB3_2: # %exceptional_return
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Ltmp18:
-; CHECK-NEXT: jmp .LBB3_1
+; CHECK-ALL-LABEL: test_null_undef:
+; CHECK-ALL: # %bb.0: # %entry
+; CHECK-ALL-NEXT: pushq %rax
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT: .Ltmp16:
+; CHECK-ALL-NEXT: callq some_call
+; CHECK-ALL-NEXT: .Ltmp19:
+; CHECK-ALL-NEXT: .Ltmp17:
+; CHECK-ALL-NEXT: .LBB3_1: # %normal_return
+; CHECK-ALL-NEXT: xorl %eax, %eax
+; CHECK-ALL-NEXT: popq %rcx
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT: retq
+; CHECK-ALL-NEXT: .LBB3_2: # %exceptional_return
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT: .Ltmp18:
+; CHECK-ALL-NEXT: jmp .LBB3_1
   gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
   %sp1 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...)
          @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 0) ["gc-live"(i64 addrspace(1)* null, i64 addrspace(1)* undef)]
@@ -231,26 +366,26 @@
 }
 
 define i64 addrspace(1)* @test_alloca_and_const(i64 addrspace(1)* %val1)
-; CHECK-LABEL: test_alloca_and_const:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Ltmp20:
-; CHECK-NEXT: callq some_call
-; CHECK-NEXT: .Ltmp23:
-; CHECK-NEXT: .Ltmp21:
-; CHECK-NEXT: # %bb.1: # %normal_return
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB4_2: # %exceptional_return
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Ltmp22:
-; CHECK-NEXT: movl $15, %eax
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; CHECK-ALL-LABEL: test_alloca_and_const:
+; CHECK-ALL: # %bb.0: # %entry
+; CHECK-ALL-NEXT: pushq %rax
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT: .Ltmp20:
+; CHECK-ALL-NEXT: callq some_call
+; CHECK-ALL-NEXT: .Ltmp23:
+; CHECK-ALL-NEXT: .Ltmp21:
+; CHECK-ALL-NEXT: # %bb.1: # %normal_return
+; CHECK-ALL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; CHECK-ALL-NEXT: popq %rcx
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT: retq
+; CHECK-ALL-NEXT: .LBB4_2: # %exceptional_return
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-ALL-NEXT: .Ltmp22:
+; CHECK-ALL-NEXT: movl $15, %eax
+; CHECK-ALL-NEXT: popq %rcx
+; CHECK-ALL-NEXT: .cfi_def_cfa_offset 8
+; CHECK-ALL-NEXT: retq
   gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
   %a = alloca i32
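The invoke tests in this file all share one shape: relocates on the normal path consume the statepoint token, while relocates on the exceptional path consume the token produced by `landingpad token`. A minimal sketch of that shape, under the assumption that relocate indices here are relative to the "gc-live" bundle (hypothetical names, for illustration only):

declare void @some_call(i64 addrspace(1)*)
declare i32 @personality_function()
declare token @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...)
declare i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token, i32, i32)

define i64 addrspace(1)* @invoke_shape(i64 addrspace(1)* %obj) gc "statepoint-example" personality i32 ()* @personality_function {
entry:
  %tok = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0) ["gc-live"(i64 addrspace(1)* %obj)]
          to label %normal_return unwind label %exceptional_return

normal_return:
  ; Normal path: relocate against the statepoint token.
  %obj.n = call i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %tok, i32 0, i32 0)
  ret i64 addrspace(1)* %obj.n

exceptional_return:
  ; Exceptional path: relocate against the landingpad token.
  %lpad = landingpad token
          cleanup
  %obj.e = call i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %lpad, i32 0, i32 0)
  ret i64 addrspace(1)* %obj.e
}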
diff --git a/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll b/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
--- a/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
+++ b/llvm/test/CodeGen/X86/statepoint-no-extra-const.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -use-registers-for-gcptrs=true | FileCheck --check-prefix=CHECK-VREG %s
 
 define i8 addrspace(1)* @no_extra_const(i8 addrspace(1)* %obj) gc "statepoint-example" {
 ; CHECK-LABEL: no_extra_const:
@@ -13,6 +14,23 @@
 ; CHECK-NEXT: popq %rcx
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
+; CHECK-VREG-LABEL: no_extra_const:
+; CHECK-VREG: # %bb.0: # %entry
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: subq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p)
+; CHECK-VREG-NEXT: nopl 8(%rax)
+; CHECK-VREG-NEXT: .Ltmp0:
+; CHECK-VREG-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx
+; CHECK-VREG-NEXT: movq %rbx, %rax
+; CHECK-VREG-NEXT: addq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
 entry:
   %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...)
      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 4, void ()* null, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj)
   %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; (%obj, %obj)
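The next file exercises the rule spelled out in its comment: a value that is live both as a deopt operand and as a GC pointer still needs a stack copy for the GC side, even when GC pointers may otherwise travel in registers. A reduced sketch of the dual-use pattern (hypothetical names; the relocate indices mirror statepoint-duplicates-export.ll, and this is an illustration rather than part of the patch):

declare void @bar()
declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)

define i32 addrspace(1)* @gc_and_deopt(i32 addrspace(1)* %p) gc "statepoint-example" {
entry:
  ; %p appears once in the "deopt" bundle and once as a base/derived GC pair.
  %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %p, i32 addrspace(1)* %p) ["deopt"(i32 addrspace(1)* %p)]
  ; The GC copy must be reloadable from a stack slot after the call.
  %p.rel = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 7, i32 7)
  ret i32 addrspace(1)* %p.rel
}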
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck --check-prefixes=CHECK,CHECK-SPILL %s
+; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK,CHECK-VREG %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
@@ -97,23 +98,49 @@
 ; A gc-value must be spilled even if it is also a deopt value.
 define i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
-; CHECK-LABEL: test5:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: subq $16, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: movl %edi, %ebx
-; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: callq _bar
-; CHECK-NEXT: Ltmp5:
-; CHECK-NEXT: callq _bar
-; CHECK-NEXT: Ltmp6:
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: addq $16, %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: retq
+; CHECK-SPILL-LABEL: test5:
+; CHECK-SPILL: ## %bb.0: ## %entry
+; CHECK-SPILL-NEXT: pushq %rbx
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT: subq $16, %rsp
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SPILL-NEXT: .cfi_offset %rbx, -16
+; CHECK-SPILL-NEXT: movl %edi, %ebx
+; CHECK-SPILL-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-SPILL-NEXT: callq _bar
+; CHECK-SPILL-NEXT: Ltmp5:
+; CHECK-SPILL-NEXT: callq _bar
+; CHECK-SPILL-NEXT: Ltmp6:
+; CHECK-SPILL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-SPILL-NEXT: addq $16, %rsp
+; CHECK-SPILL-NEXT: popq %rbx
+; CHECK-SPILL-NEXT: retq
+;
+; CHECK-VREG-LABEL: test5:
+; CHECK-VREG: ## %bb.0: ## %entry
+; CHECK-VREG-NEXT: pushq %rbp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 24
+; CHECK-VREG-NEXT: pushq %rax
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -24
+; CHECK-VREG-NEXT: .cfi_offset %rbp, -16
+; CHECK-VREG-NEXT: movq %rsi, (%rsp)
+; CHECK-VREG-NEXT: movl %edi, %ebp
+; CHECK-VREG-NEXT: callq _bar
+; CHECK-VREG-NEXT: Ltmp5:
+; CHECK-VREG-NEXT: movq (%rsp), %rbx
+; CHECK-VREG-NEXT: movq %rbx, (%rsp)
+; CHECK-VREG-NEXT: callq _bar
+; CHECK-VREG-NEXT: Ltmp6:
+; CHECK-VREG-NEXT: movq (%rsp), %rbx
+; CHECK-VREG-NEXT: movq %rbx, %rax
+; CHECK-VREG-NEXT: addq $8, %rsp
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: popq %rbp
+; CHECK-VREG-NEXT: retq
+
 entry:
   %token = call token (i64, i32, void ()*, i32, i32, ...)
      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %p, i32 addrspace(1)* %p) ["deopt"(i32 %a)]
   %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token, i32 8, i32 8)
@@ -672,30 +699,27 @@
   ret void
 }
 
-define i32 addrspace(1)* @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepoint-example" {
+define void @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepoint-example" {
 ; CHECK-LABEL: test_fpconst_deopt:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movq %rdi, (%rsp)
 ; CHECK-NEXT: nopl 8(%rax,%rax)
 ; CHECK-NEXT: Ltmp18:
-; CHECK-NEXT: movq (%rsp), %rax
-; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
-  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %in) ["deopt" (
+  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (
   float 0x40421A1CA0000000, float 0x40459A1CA0000000, float 0x40401A1CA0000000, float 0x40479A1CA0000000,
   float 0x403C343940000000, float 0x403E343940000000, float 0x40469A1CA0000000, float 0x40489A1CA0000000,
   float 0x404A9A1CA0000000, float 0x40499A1CA0000000, float 0xC05FCD2F20000000, float 0xC05C0D2F20000000,
   float 0xC060269780000000, float 0xC05B8D2F20000000, float 0xC060669780000000, float 0xC05B0D2F20000000,
   float 0xC060A69780000000, float 0xC05A8D2F20000000, float 0xC060E69780000000, float 0x40439A1CA0000000)]
-  %out = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 7, i32 7)
-  ret i32 addrspace(1)* %out
+  ret void
 }
 
 ; CHECK-LABEL: __LLVM_StackMaps:
 ; CHECK: .long Ltmp18-_test_fpconst_deopt
 ; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 23
 ; CHECK-NEXT: .byte 4
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .short 8
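A hedged note on the `.short 25` to `.short 23` change above: a statepoint's stackmap record starts with three constant locations (calling convention, statepoint flags, number of deopt arguments), followed here by the 20 float deopt constants; the old form also appended a base/derived pair for the relocated pointer, giving 3 + 20 + 2 = 25 locations. With the GC pointer and its relocate removed from the test, only 3 + 20 = 23 locations remain.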
diff --git a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
--- a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
+++ b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK-SPILL,CHECK %s
+; RUN: llc -verify-machineinstrs -use-registers-for-gcptrs=true < %s | FileCheck --check-prefixes=CHECK-VREG,CHECK %s
 
 target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-linux-gnu"
 
@@ -12,20 +13,41 @@
 ;; Two gc.relocates of the same input, should require only a single spill/fill
 define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
-; CHECK-LABEL: test_gcrelocate_uniqueing:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movq %rdi, (%rsp)
-; CHECK-NEXT: callq f
-; CHECK-NEXT: .Ltmp0:
-; CHECK-NEXT: movq (%rsp), %rdi
-; CHECK-NEXT: movq %rdi, %rsi
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: callq use
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; CHECK-SPILL-LABEL: test_gcrelocate_uniqueing:
+; CHECK-SPILL: # %bb.0:
+; CHECK-SPILL-NEXT: pushq %rax
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT: movq %rdi, (%rsp)
+; CHECK-SPILL-NEXT: callq f
+; CHECK-SPILL-NEXT: .Ltmp0:
+; CHECK-SPILL-NEXT: movq (%rsp), %rdi
+; CHECK-SPILL-NEXT: movq %rdi, %rsi
+; CHECK-SPILL-NEXT: xorl %eax, %eax
+; CHECK-SPILL-NEXT: callq use
+; CHECK-SPILL-NEXT: popq %rax
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 8
+; CHECK-SPILL-NEXT: retq
+
+; CHECK-VREG-LABEL: test_gcrelocate_uniqueing:
+; CHECK-VREG: # %bb.0:
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: subq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT: movq %rdi, 8(%rsp)
+; CHECK-VREG-NEXT: callq f
+; CHECK-VREG-NEXT: .Ltmp0:
+; CHECK-VREG-NEXT: movq 8(%rsp), %rbx
+; CHECK-VREG-NEXT: movq %rbx, %rdi
+; CHECK-VREG-NEXT: movq %rbx, %rsi
+; CHECK-VREG-NEXT: xorl %eax, %eax
+; CHECK-VREG-NEXT: callq use
+; CHECK-VREG-NEXT: addq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr) ["deopt" (i32 addrspace(1)* %ptr, i32 undef)]
   %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 7, i32 7)
@@ -36,20 +58,41 @@
 ;; Two gc.relocates of a bitcasted pointer should only require a single spill/fill
 define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
-; CHECK-LABEL: test_gcptr_uniqueing:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movq %rdi, (%rsp)
-; CHECK-NEXT: callq f
-; CHECK-NEXT: .Ltmp1:
-; CHECK-NEXT: movq (%rsp), %rdi
-; CHECK-NEXT: movq %rdi, %rsi
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: callq use
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; CHECK-SPILL-LABEL: test_gcptr_uniqueing:
+; CHECK-SPILL: # %bb.0:
+; CHECK-SPILL-NEXT: pushq %rax
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SPILL-NEXT: movq %rdi, (%rsp)
+; CHECK-SPILL-NEXT: callq f
+; CHECK-SPILL-NEXT: .Ltmp1:
+; CHECK-SPILL-NEXT: movq (%rsp), %rdi
+; CHECK-SPILL-NEXT: movq %rdi, %rsi
+; CHECK-SPILL-NEXT: xorl %eax, %eax
+; CHECK-SPILL-NEXT: callq use
+; CHECK-SPILL-NEXT: popq %rax
+; CHECK-SPILL-NEXT: .cfi_def_cfa_offset 8
+; CHECK-SPILL-NEXT: retq
+
+; CHECK-VREG-LABEL: test_gcptr_uniqueing:
+; CHECK-VREG: # %bb.0:
+; CHECK-VREG-NEXT: pushq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: subq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 32
+; CHECK-VREG-NEXT: .cfi_offset %rbx, -16
+; CHECK-VREG-NEXT: movq %rdi, 8(%rsp)
+; CHECK-VREG-NEXT: callq f
+; CHECK-VREG-NEXT: .Ltmp1:
+; CHECK-VREG-NEXT: movq 8(%rsp), %rbx
+; CHECK-VREG-NEXT: movq %rbx, %rdi
+; CHECK-VREG-NEXT: movq %rbx, %rsi
+; CHECK-VREG-NEXT: xorl %eax, %eax
+; CHECK-VREG-NEXT: callq use
+; CHECK-VREG-NEXT: addq $16, %rsp
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 16
+; CHECK-VREG-NEXT: popq %rbx
+; CHECK-VREG-NEXT: .cfi_def_cfa_offset 8
+; CHECK-VREG-NEXT: retq
   %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2) ["deopt" (i32 addrspace(1)* %ptr, i32 undef)]