diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -99,6 +99,15 @@
   using StatepointSpillMapTy = DenseMap<const Value *, Optional<int>>;
   DenseMap<const Instruction *, StatepointSpillMapTy> StatepointSpillMaps;
 
+  /// For each statepoint keep mapping from original derived pointer to
+  /// the index of StatepointSDNode result defining its new value.
+  using DerivedPtrMapTy = DenseMap<const Value *, unsigned>;
+  DenseMap<const Instruction *, DerivedPtrMapTy> DerivedPtrMap;
+
+  /// For each statepoint keep virtual registers its result values have
+  /// been exported to.
+  DenseMap<const Instruction *, SmallVector<unsigned, 2>> StatepointRegs;
+
   /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in
   /// the entry block.  This allows the allocas to be efficiently referenced
   /// anywhere in the function.
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -81,6 +81,37 @@
   return N;
 }
 
+/// Collect used physical registers up the glue chain.
+static void collectPhysRegsFromGlueChain(const SDNode *Node,
+                                         const TargetInstrInfo *TII,
+                                         SmallVectorImpl<Register> &UsedRegs) {
+  if (Node->getValueType(Node->getNumValues() - 1) != MVT::Glue)
+    return;
+  for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
+    if (F->getOpcode() == ISD::CopyFromReg) {
+      UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+      continue;
+    } else if (F->getOpcode() == ISD::CopyToReg) {
+      // Skip CopyToReg nodes that are internal to the glue chain.
+      continue;
+    }
+    // Collect declared implicit uses.
+    if (F->isMachineOpcode()) {
+      const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
+      UsedRegs.append(MCID.getImplicitUses(),
+                      MCID.getImplicitUses() + MCID.getNumImplicitUses());
+    }
+    // In addition to declared implicit uses, we must also check for
+    // direct RegisterSDNode operands.
+    for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+      if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+        Register Reg = R->getReg();
+        if (Reg.isPhysical())
+          UsedRegs.push_back(Reg);
+      }
+  }
+}
+
 /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
 /// implicit physical register output.
 void InstrEmitter::
@@ -940,30 +971,7 @@
     }
   }
 
-  // Scan the glue chain for any used physregs.
-  if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
-    for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
-      if (F->getOpcode() == ISD::CopyFromReg) {
-        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
-        continue;
-      } else if (F->getOpcode() == ISD::CopyToReg) {
-        // Skip CopyToReg nodes that are internal to the glue chain.
-        continue;
-      }
-      // Collect declared implicit uses.
-      const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
-      UsedRegs.append(MCID.getImplicitUses(),
-                      MCID.getImplicitUses() + MCID.getNumImplicitUses());
-      // In addition to declared implicit uses, we must also check for
-      // direct RegisterSDNode operands.
-      for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
-        if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
-          Register Reg = R->getReg();
-          if (Reg.isPhysical())
-            UsedRegs.push_back(Reg);
-        }
-    }
-  }
+  collectPhysRegsFromGlueChain(Node, TII, UsedRegs);
 
   // Finally mark unused registers as dead.
   if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
@@ -1041,6 +1049,44 @@
     break;
   }
 
+  case ISD::GC_STATEPOINT: {
+    StatepointSDNode *SN = cast<StatepointSDNode>(Node);
+    unsigned GCArgStart = SN->getGCArgStart();
+    unsigned NumValues = CountResults(Node);
+    MachineInstrBuilder MIB =
+        BuildMI(*MF, Node->getDebugLoc(), TII->get(TargetOpcode::STATEPOINT));
+
+    for (unsigned i = 0; i < NumValues; ++i) {
+      SDValue Def(SN, i);
+      SDValue Use = SN->getOperand(GCArgStart + 1 + i * 2);
+      unsigned UseReg = getVR(Use, VRBaseMap);
+      unsigned DefReg = MRI->cloneVirtualRegister(UseReg);
+      MIB = MIB.addDef(DefReg);
+      VRBaseMap[Def] = DefReg;
+    }
+
+    unsigned NumOperands = Node->getNumOperands();
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      const SDValue &O = Node->getOperand(i);
+      if (O.getValueType() == MVT::Other || O.getValueType() == MVT::Glue)
+        continue;
+      AddOperand(MIB, O, 0, nullptr, VRBaseMap, false, false, false);
+    }
+
+    unsigned Use = NumValues + GCArgStart + 1;
+    for (unsigned Def = 0; Def < NumValues; ++Def, Use += 2) {
+      MIB->tieOperands(Def, Use);
+    }
+
+    MBB->insert(InsertPos, MIB);
+
+    SmallVector<Register, 8> UsedRegs;
+    collectPhysRegsFromGlueChain(Node, TII, UsedRegs);
+    if (!UsedRegs.empty())
+      MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
+
+    break;
+  }
   case ISD::INLINEASM:
   case ISD::INLINEASM_BR: {
     unsigned NumOps = Node->getNumOperands();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2813,6 +2813,7 @@
   case ISD::ANNOTATION_LABEL:
   case ISD::LIFETIME_START:
   case ISD::LIFETIME_END:
+  case ISD::GC_STATEPOINT:
     NodeToMatch->setNodeId(-1); // Mark selected.
     return;
   case ISD::AssertSext:
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -66,6 +66,10 @@
     "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
     cl::desc("Allow using registers for non pointer deopt args"));
 
+cl::opt<bool> UseRegistersForGCPointers(
+    "use-registers-for-gcptrs", cl::Hidden, cl::init(false),
+    cl::desc("Allow using registers for GC pointer meta args"));
+
 static void pushStackMapConstant(SmallVectorImpl<SDValue> &Ops,
                                  SelectionDAGBuilder &Builder, uint64_t Value) {
   SDLoc L = Builder.getCurSDLoc();
@@ -220,6 +224,11 @@
   return None;
 }
 
+// Return true if V is a GC pointer which need not be relocated.
+static bool isNonRelocatablePtr(SDValue V) {
+  return (isa<ConstantSDNode>(V) || isa<FrameIndexSDNode>(V));
+}
+
 /// Try to find existing copies of the incoming values in stack slots used for
 /// statepoint spilling.  If we can find a spill slot for the incoming value,
 /// mark that slot as allocated, and reuse the same slot for this safepoint.
@@ -229,11 +238,8 @@
                                                 SelectionDAGBuilder &Builder) {
   SDValue Incoming = Builder.getValue(IncomingValue);
 
-  if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
-    // We won't need to spill this, so no need to check for previously
-    // allocated stack slots
+  if (isNonRelocatablePtr(Incoming))
     return;
-  }
 
   SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
   if (OldLocation.getNode())
@@ -273,6 +279,31 @@
   Builder.StatepointLowering.setLocation(Incoming, Loc);
 }
 
+/// Sort Ptrs vector so that pointers which need no relocation (constants and
+/// allocas) are placed at the end and those which need relocation
+/// contiguously occupy beginning of the vector.
+/// Synchronously update Bases and Relocs vectors.
+static unsigned sortGCPtrs(SmallVectorImpl<const Value *> &Bases,
+                           SmallVectorImpl<const Value *> &Ptrs,
+                           SmallVectorImpl<const GCRelocateInst *> &Relocs,
+                           SelectionDAGBuilder &Builder) {
+  unsigned curPos = 0;
+  for (unsigned i = 0, e = Ptrs.size(); i < e; ++i) {
+    SDValue SDV = Builder.getValue(Ptrs[i]);
+    if (isNonRelocatablePtr(SDV) || SDV.getOpcode() == ISD::UNDEF ||
+        SDV.getValueType().getSizeInBits() > 64) {
+      continue;
+    }
+    if (curPos < i) {
+      std::swap(Bases[curPos], Bases[i]);
+      std::swap(Ptrs[curPos], Ptrs[i]);
+      std::swap(Relocs[curPos], Relocs[i]);
+    }
+    ++curPos;
+  }
+  return curPos;
+}
+
 /// Extract call from statepoint, lower it and return pointer to the
 /// call node. Also update NodeMap so that getValue(statepoint) will
 /// reference lowered call result
@@ -366,7 +397,7 @@
                            StoreMMO);
 
   MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
-  
+
   Builder.StatepointLowering.setLocation(Incoming, Loc);
 }
 
@@ -443,7 +474,9 @@
 /// will be set to the last value spilled (if any were).
 static void
 lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
-                        SmallVectorImpl<MachineMemOperand *> &MemRefs,
-                        SelectionDAGBuilder::StatepointLoweringInfo &SI,
+                        SmallVectorImpl<MachineMemOperand *> &MemRefs,
+                        unsigned NumVRegGCArgs, unsigned &GCArgStart,
+                        SelectionDAGBuilder::StatepointLoweringInfo &SI,
                         SelectionDAGBuilder &Builder) {
   // Lower the deopt and gc arguments for this statepoint.  Layout will be:
   // deopt argument length, deopt arguments.., gc arguments...
@@ -500,7 +533,11 @@
   };
 
   auto requireSpillSlot = [&](const Value *V) {
-    return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
+    if (isGCValue(V)) {
+      auto *Ty = V->getType();
+      return Ty->isVectorTy() || !UseRegistersForGCPointers;
+    }
+    return !(LiveInDeopt || UseRegistersForDeoptValues);
   };
 
   // Before we actually start lowering (and allocating spill slots for values),
@@ -512,7 +549,7 @@
     if (requireSpillSlot(V))
       reservePreviousStackSlotForValue(V, Builder);
   }
-  for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+  for (unsigned i = NumVRegGCArgs; i < SI.Bases.size(); ++i) {
     reservePreviousStackSlotForValue(SI.Bases[i], Builder);
     reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
   }
@@ -540,21 +577,31 @@
                                  Builder);
   }
 
+  GCArgStart = Ops.size();
   // Finally, go ahead and lower all the gc arguments.  There's no prefixed
   // length for this one.  After lowering, we'll have the base and pointer
   // arrays interwoven with each (lowered) base pointer immediately followed by
   // it's (lowered) derived pointer.  i.e
   // (base[0], ptr[0], base[1], ptr[1], ...)
+  // Lower first `NumVRegGCArgs` base AND derived pointers through VRegs.
+  // In future we might use more sophisticated strategy for choosing which
+  // pointers to pass via virtual registers, but for now this simple approach
+  // looks good enough. Take into account these facts:
+  // - NumVRegGCArgs is limited by the max number of tied registers in MI;
+  // - We relocate (and so need tied defs for) only derived pointers;
+  // - Quite often base and derived pointer are the same.
+  auto &SL = Builder.StatepointLowering;
   for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+    bool RequireSpillSlot = (i >= NumVRegGCArgs);
     const Value *Base = SI.Bases[i];
-    lowerIncomingStatepointValue(Builder.getValue(Base),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
-                                 Builder);
+    lowerIncomingStatepointValue(Builder.getValue(Base), RequireSpillSlot, Ops,
+                                 MemRefs, Builder);
 
     const Value *Ptr = SI.Ptrs[i];
-    lowerIncomingStatepointValue(Builder.getValue(Ptr),
-                                 /*RequireSpillSlot*/ true, Ops, MemRefs,
-                                 Builder);
+    SDValue SDV = Builder.getValue(Ptr);
+    lowerIncomingStatepointValue(SDV, RequireSpillSlot, Ops, MemRefs, Builder);
+    if (!RequireSpillSlot && !SL.getLocation(SDV))
+      SL.setLocation(SDV, Builder.DAG.getConstant(i, SDLoc(), MVT::i64));
   }
 
   // If there are any explicit spill slots passed to the statepoint, record
@@ -582,6 +629,7 @@
   // values, while previous loops account only values with unique SDValues.
   const Instruction *StatepointInstr = SI.StatepointInstr;
   auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr];
+  auto &DPtrMap = Builder.FuncInfo.DerivedPtrMap[StatepointInstr];
 
   for (const GCRelocateInst *Relocate : SI.GCRelocates) {
     const Value *V = Relocate->getDerivedPtr();
@@ -589,7 +637,12 @@
     SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
 
     if (Loc.getNode()) {
-      SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
+      if (auto FI = dyn_cast<FrameIndexSDNode>(Loc))
+        SpillMap[V] = FI->getIndex();
+      else {
+        DPtrMap[V] = cast<ConstantSDNode>(Loc)->getZExtValue();
+        SpillMap[V] = None;
+      }
     } else {
       // Record value as visited, but not spilled. This is case for allocas
       // and constants. For this values we can avoid emitting spill load while
@@ -629,10 +682,27 @@
     StatepointLowering.scheduleRelocCall(*Reloc);
 #endif
 
+  unsigned NumVRegs = 0;
+
+  if (UseRegistersForGCPointers) {
+    const unsigned MaxTiedRegs = 15U;
+
+    // Sort vectors so that elements which need relocation are laid out
+    // contiguously at the beginning of vectors.
+    // This is dictated by the StatepointSDNode implementation: due to size limit,
+    // one cannot put vector into class derived from SDNode, we can only keep a
+    // number of relocations. So N results of StatepointSDNode map 1-1 to the
+    // first N derived pointers.
+    NumVRegs = sortGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this);
+    NumVRegs = std::min(NumVRegs, MaxTiedRegs);
+  }
+
   // Lower statepoint vmstate and gcstate arguments
   SmallVector<SDValue, 10> LoweredMetaArgs;
   SmallVector<MachineMemOperand *, 16> MemRefs;
-  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this);
+  unsigned GCArgStart;
+  lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, NumVRegs, GCArgStart, SI,
+                          *this);
 
   // Now that we've emitted the spills, we need to update the root so that the
   // call sequence is ordered correctly.
@@ -732,6 +802,7 @@
   pushStackMapConstant(Ops, *this, Flags);
 
   // Insert all vmstate and gcstate arguments
+  GCArgStart += Ops.size();
   Ops.insert(Ops.end(), LoweredMetaArgs.begin(), LoweredMetaArgs.end());
 
   // Add register mask from call node
@@ -744,15 +815,10 @@
   if (Glue.getNode())
     Ops.push_back(Glue);
 
-  // Compute return values.  Provide a glue output since we consume one as
-  // input.  This allows someone else to chain off us as needed.
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-
-  MachineSDNode *StatepointMCNode =
-      DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
-  DAG.setNodeMemRefs(StatepointMCNode, MemRefs);
+  SDValue STV =
+      DAG.getStatepoint(getCurSDLoc(), ReturnVal, GCArgStart, NumVRegs, Ops);
 
-  SDNode *SinkNode = StatepointMCNode;
+  SDNode *SinkNode = STV.getNode();
 
   // Build the GC_TRANSITION_END node if necessary.
   //
@@ -762,7 +828,7 @@
     SmallVector<SDValue, 8> TEOps;
 
     // Add chain
-    TEOps.push_back(SDValue(StatepointMCNode, 0));
+    TEOps.push_back(SDValue(STV.getNode(), STV->getNumValues() - 2));
 
     // Add GC transition arguments
     for (const Value *V : SI.GCTransitionArgs) {
@@ -772,7 +838,7 @@
     }
 
     // Add glue
-    TEOps.push_back(SDValue(StatepointMCNode, 1));
+    TEOps.push_back(SDValue(STV.getNode(), STV->getNumValues() - 1));
 
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
@@ -783,7 +849,12 @@
   }
 
   // Replace original call
-  DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
+  // Call: ch,glue = CALL ...
+  // Statepoint: [gc relocates],ch,glue = GC_STATEPOINT ...
+  unsigned NumSinkValues = SinkNode->getNumValues();
+  SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2),
+                                 SDValue(SinkNode, NumSinkValues - 1)};
+  DAG.ReplaceAllUsesWith(CallNode, StatepointValues);
 
   // Remove original call node
   DAG.DeleteNode(CallNode);
@@ -796,7 +867,7 @@
   // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
   // to actually be possible today.
 
-  return ReturnVal;
+  return STV;
 }
 
 void
@@ -870,21 +941,46 @@
   SI.NumPatchBytes = ISP.getNumPatchBytes();
   SI.EHPadBB = EHPadBB;
 
-  SDValue ReturnValue = LowerAsSTATEPOINT(SI);
+  SDValue STV = LowerAsSTATEPOINT(SI);
 
   // Export the result value if needed
+  const BasicBlock *BB = ISP.getCall()->getParent();
+  std::vector<const GCRelocateInst *> RV = ISP.getRelocates();
+  bool NeedExport = llvm::any_of(
+      RV, [&BB](const GCRelocateInst *I) { return I->getParent() != BB; });
+
+  // If any of relocates or result value will be used in different basic
+  // block, we need to export them manually. Default exporting mechanism
+  // will not work here because it is based on IR Value types, and
+  // IR statepoint has different type than the actual call or relocates.
+  // It means that by default llvm will create export register of the wrong
+  // type (always i32 - TokenTy - in our case). So instead we need to create
+  // export registers manually.
+  // TODO: To eliminate this problem we can remove gc.result/gc.relocate
+  // intrinsics completely and make statepoint call to return a tuple.
+  setValue(ISP.getInstruction(), STV);
+  if (NeedExport) {
+    LLVMContext *Context = DAG.getContext();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    const DataLayout &DL = DAG.getDataLayout();
+    for (unsigned i = 0, e = STV->getNumValues() - 2; i < e; ++i) {
+      Value *DerivedPtr = SI.GCRelocates[i]->getDerivedPtr();
+      SDValue Res(STV.getNode(), i);
+      Type *Ty = DerivedPtr->getType();
+      unsigned Reg = FuncInfo.CreateRegs(Ty);
+      RegsForValue RFV(*Context, TLI, DL, Reg, Ty, None);
+      SDValue Chain = DAG.getEntryNode();
+
+      RFV.getCopyToRegs(Res, DAG, getCurSDLoc(), Chain, nullptr, DerivedPtr);
+      PendingExports.push_back(Chain);
+      FuncInfo.StatepointRegs[SI.StatepointInstr].push_back(Reg);
+    }
+  }
 
   const GCResultInst *GCResult = ISP.getGCResult();
   Type *RetTy = ISP.getActualReturnType();
   if (!RetTy->isVoidTy() && GCResult) {
     if (GCResult->getParent() != ISP.getCall()->getParent()) {
-      // Result value will be used in a different basic block so we need to
-      // export it now.  Default exporting mechanism will not work here because
-      // statepoint call has a different type than the actual call.  It means
-      // that by default llvm will create export register of the wrong type
-      // (always i32 in our case). So instead we need to create export register
-      // with correct type manually.
-      // TODO: To eliminate this problem we can remove gc.result intrinsics
-      // completely and make statepoint call to return a tuple.
+      SDValue ReturnValue = cast<StatepointSDNode>(STV)->getActualRetVal();
       unsigned Reg = FuncInfo.CreateRegs(RetTy);
       RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                        DAG.getDataLayout(), Reg, RetTy,
@@ -894,16 +990,7 @@
       RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
 
       PendingExports.push_back(Chain);
       FuncInfo.ValueMap[ISP.getInstruction()] = Reg;
-    } else {
-      // Result value will be used in a same basic block. Don't export it or
-      // perform any explicit register copies.
-      // We'll replace the actuall call node shortly. gc_result will grab
-      // this value.
-      setValue(ISP.getInstruction(), ReturnValue);
     }
-  } else {
-    // The token value is never used from here on, just generate a poison value
-    setValue(ISP.getInstruction(), DAG.getIntPtrConstant(-1, getCurSDLoc()));
   }
 }
 
@@ -934,7 +1021,9 @@
 
   // NB! The GC arguments are deliberately left empty.
 
-  if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) {
+  SDValue STV = LowerAsSTATEPOINT(SI);
+  if (SDValue ReturnVal =
+          cast<StatepointSDNode>(STV.getNode())->getActualRetVal()) {
     ReturnVal = lowerRangeToAssertZExt(DAG, *Call, ReturnVal);
     setValue(Call, ReturnVal);
   }
@@ -968,17 +1057,19 @@
     assert(CopyFromReg.getNode());
     setValue(&CI, CopyFromReg);
   } else {
-    setValue(&CI, getValue(I));
+    StatepointSDNode *STN = cast<StatepointSDNode>(getValue(I));
+    setValue(&CI, STN->getActualRetVal());
   }
 }
 
 void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+  const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent();
 #ifndef NDEBUG
   // Consistency check
   // We skip this check for relocates not in the same basic block as their
   // statepoint. It would be too expensive to preserve validation info through
   // different basic blocks.
-  if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
+  if (StatepointBB == Relocate.getParent())
     StatepointLowering.relocCallVisited(Relocate);
 
   auto *Ty = Relocate.getType()->getScalarType();
@@ -994,6 +1085,33 @@
   assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
   Optional<int> DerivedPtrLocation = SlotIt->second;
 
+  auto &DPtrMap = FuncInfo.DerivedPtrMap[Relocate.getStatepoint()];
+  auto It = DPtrMap.find(Relocate.getDerivedPtr());
+  if (It != DPtrMap.end()) {
+    // This GC ptr is lowered through VReg.
+    unsigned Index = It->second;
+    SDValue Result;
+    auto &StatepointRegs = FuncInfo.StatepointRegs[Relocate.getStatepoint()];
+    if (StatepointBB != Relocate.getParent()) {
+      // Statepoint is in different basic block. Default getValue() mechanism
+      // does not work here, so we need create CopyFromRegs manually.
+      // See comment in LowerStatepoint for details.
+      assert(Index < StatepointRegs.size());
+      unsigned InReg = StatepointRegs[Index];
+      RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+                       DAG.getDataLayout(), InReg, DerivedPtr->getType(),
+                       None); // This is not an ABI copy.
+      SDValue Chain = DAG.getEntryNode();
+      Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
+                                   DerivedPtr);
+    } else {
+      SDNode *Statepoint = getValue(Relocate.getStatepoint()).getNode();
+      Result = SDValue(Statepoint, Index);
+    }
+    setValue(&Relocate, Result);
+    return;
+  }
+
   // We didn't need to spill these special cases (constants and allocas).
   // See the handling in spillIncomingValueForStatepoint for detail.
   if (!DerivedPtrLocation) {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1014,11 +1014,10 @@
   // STATEPOINT Deopt Alloca - live-through, read only, direct
   // (We're currently conservative and mark the deopt slots read/write in
   // practice.)
-  // STATEPOINT GC Spill   - live-through, read/write, indirect
+  // STATEPOINT GC Spill   - live-through, read/write, indirect or vreg
   // STATEPOINT GC Alloca  - live-through, read/write, direct
-  // The live-in vs live-through is handled already (the live through ones are
-  // all stack slots), but we need to handle the different type of stackmap
-  // operands and memory effects here.
+  // The live-in vs live-through is handled already but we need to handle
+  // the different type of stackmap operands and memory effects here.
 
   // MI changes inside this loop as we grow operands.
   for(unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) {
@@ -1031,9 +1030,19 @@
     int FI = MO.getIndex();
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());
 
+    auto transferOperand = [MIB, MI](unsigned i) {
+      const MachineOperand &O = MI->getOperand(i);
+      unsigned TiedTo = i;
+      if (O.isReg() && O.isTied())
+        TiedTo = MI->findTiedOperandIdx(i);
+      MIB.add(O);
+      if (TiedTo < i)
+        MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1);
+    };
+
     // Copy operands before the frame-index.
     for (unsigned i = 0; i < OperIdx; ++i)
-      MIB.add(MI->getOperand(i));
+      transferOperand(i);
     // Add frame index operands recognized by stackmaps.cpp
     if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
       // indirect-mem-ref tag, size, #FI, offset.
@@ -1054,7 +1063,7 @@
     }
     // Copy the operands after the frame index.
     for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
-      MIB.add(MI->getOperand(i));
+      transferOperand(i);
 
     // Inherit previous memory operands.
     MIB.cloneMemRefs(*MI);