Index: llvm/include/llvm/CodeGen/TargetRegisterInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -702,6 +702,14 @@
     return RC;
   }
 
+  /// Returns the register class a virtual register must have when it is copied
+  /// directly to or from PhysReg. Returning nullptr (the default) means there
+  /// is no constraint and callers keep the class they would otherwise use.
+  virtual const TargetRegisterClass *
+  getPhysRegCopyRegClass(unsigned PhysReg) const {
+    return nullptr;
+  }
+
   /// Returns the largest super class of RC that is legal to use in the current
   /// sub-target and has the same spill size.
   /// The returned register class can be used to create virtual registers which
Index: llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -166,6 +166,8 @@
   } else {
     DstRC = TLI->getRegClassFor(VT, Node->isDivergent());
   }
+  if (auto *CopyRC = TRI->getPhysRegCopyRegClass(SrcReg))
+    DstRC = CopyRC;
 
   // If all uses are reading from the src physical register and copying the
   // register is either impossible or very expensive, then don't create a copy.
@@ -1017,6 +1019,13 @@
     if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
       break;
 
+    if (auto *CopyRC = TRI->getPhysRegCopyRegClass(DestReg)) {
+      unsigned TmpReg = MRI->createVirtualRegister(CopyRC);
+      BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            TmpReg).addReg(SrcReg);
+      SrcReg = TmpReg;
+    }
+
     BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
             DestReg).addReg(SrcReg);
     break;
Index: llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -58,6 +58,9 @@
   const TargetRegisterClass *
   getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
 
+  const TargetRegisterClass *
+  getPhysRegCopyRegClass(unsigned PhysReg) const override;
+
   bool getRegAllocationHints(unsigned VirtReg,
                              ArrayRef<MCPhysReg> Order,
                              SmallVectorImpl<MCPhysReg> &Hints,
Index: llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -421,3 +421,10 @@
   return RC;
 }
 
+const TargetRegisterClass *
+SystemZRegisterInfo::getPhysRegCopyRegClass(unsigned PhysReg) const {
+  // EAR / SAR can only use GR32 registers, so a copy to or from an access
+  // register has to go through a GR32 virtual register.
+  const bool IsAccessReg = SystemZ::AR32BitRegClass.contains(PhysReg);
+  return IsAccessReg ? &SystemZ::GR32BitRegClass : nullptr;
+}
Index: llvm/test/CodeGen/SystemZ/tls-08.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/tls-08.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mcpu=z196 -mtriple=s390x-linux-gnu -O0 -stop-after=finalize-isel \
+; RUN:   2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Test that copies to/from access registers are emitted from isel with GR32 regs.
+
+@x = dso_local thread_local global i32 0, align 4
+define weak_odr hidden i32* @fun0() {
+; CHECK-LABEL: name:            fun0
+; CHECK: %{{[0-9]+}}:gr32bit = COPY $a0
+; CHECK: %{{[0-9]+}}:gr32bit = COPY $a1
+  ret i32* @x
+}
+
+define i32 @fun1() {
+; CHECK-LABEL: name:            fun1
+; CHECK: %[[SRC:[0-9]+]]:gr32bit = COPY %{{[0-9]+}}
+; CHECK: $a1 = COPY %[[SRC]]
+; CHECK: %{{[0-9]+}}:gr32bit = COPY $a0
+  %val = call i32 asm "blah", "={a0}, {a1}" (i32 0)
+  ret i32 %val
+}
Index: llvm/test/CodeGen/SystemZ/tls-09.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/tls-09.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mcpu=z196 -mtriple=s390x-linux-gnu -O0
+;
+; Test that a0 and a1 are copied successfully into GR32 registers.
+
+@x = dso_local thread_local global i32 0, align 4
+define i32 @fun0(i32 signext %arg0, i32 signext %arg1, i32 signext %arg2, i32 signext %arg3, i32 signext %arg4, i32 signext %arg5, i32 signext %arg6) {
+  %arg0.addr = alloca i32, align 4
+  %arg1.addr = alloca i32, align 4
+  %arg2.addr = alloca i32, align 4
+  %arg3.addr = alloca i32, align 4
+  %arg4.addr = alloca i32, align 4
+  %arg5.addr = alloca i32, align 4
+  %arg6.addr = alloca i32, align 4
+  %tls = load i32, i32* @x, align 4
+  store i32 %arg0, i32* %arg0.addr, align 4
+  store i32 %arg1, i32* %arg1.addr, align 4
+  store i32 %arg2, i32* %arg2.addr, align 4
+  store i32 %arg3, i32* %arg3.addr, align 4
+  store i32 %arg4, i32* %arg4.addr, align 4
+  store i32 %arg5, i32* %arg5.addr, align 4
+  store i32 %arg6, i32* %arg6.addr, align 4
+  %v0 = load i32, i32* %arg0.addr, align 4
+  %sum0 = add nsw i32 %tls, %v0
+  %v1 = load i32, i32* %arg1.addr, align 4
+  %sum1 = add nsw i32 %sum0, %v1
+  %v2 = load i32, i32* %arg2.addr, align 4
+  %sum2 = add nsw i32 %sum1, %v2
+  %v3 = load i32, i32* %arg3.addr, align 4
+  %sum3 = add nsw i32 %sum2, %v3
+  %v4 = load i32, i32* %arg4.addr, align 4
+  %sum4 = add nsw i32 %sum3, %v4
+  %v5 = load i32, i32* %arg5.addr, align 4
+  %sum5 = add nsw i32 %sum4, %v5
+  %v6 = load i32, i32* %arg6.addr, align 4
+  %sum6 = add nsw i32 %sum5, %v6
+  ret i32 %sum6
+}