diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -40,6 +40,7 @@
 class GCNPreRAOptimizations : public MachineFunctionPass {
 private:
   const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
   LiveIntervals *LIS;
 
@@ -85,32 +86,106 @@
   MachineInstr *Def0 = nullptr;
   MachineInstr *Def1 = nullptr;
   uint64_t Init = 0;
+  bool Changed = false;
+  SmallSet<Register, 32> ModifiedRegs;
+  bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg));
 
   for (MachineInstr &I : MRI->def_instructions(Reg)) {
-    if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
-        !I.getOperand(1).isImm() || I.getNumOperands() != 2)
-      return false;
-
-    switch (I.getOperand(0).getSubReg()) {
+    switch (I.getOpcode()) {
     default:
       return false;
-    case AMDGPU::sub0:
-      if (Def0)
-        return false;
-      Def0 = &I;
-      Init |= I.getOperand(1).getImm() & 0xffffffff;
+    case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
       break;
-    case AMDGPU::sub1:
-      if (Def1)
+    case AMDGPU::COPY: {
+      // Some subtargets cannot do an AGPR to AGPR copy directly and need an
+      // intermediate temporary VGPR register. Try to find the defining
+      // accvgpr_write to avoid the temporary register.
+      if (!IsAGPRDst)
+        break;
+
+      Register SrcReg = I.getOperand(1).getReg();
+
+      if (!SrcReg.isVirtual())
+        break;
+
+      // Check if source of copy is from another AGPR.
+      bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg));
+      if (!IsAGPRSrc)
+        break;
+
+      // def_instructions() does not look at subregs, so it may return an
+      // instruction that defines the same vreg but a different subreg; check
+      // the subreg manually.
+      Register SrcSubReg = I.getOperand(1).getSubReg();
+      for (auto &Def : MRI->def_instructions(SrcReg)) {
+        if (SrcSubReg != Def.getOperand(0).getSubReg())
+          continue;
+
+        if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
+          MachineOperand DefSrcMO = Def.getOperand(1);
+
+          // Immediates are not an issue and can be propagated in the
+          // postrapseudos pass. Only handle cases where the defining
+          // accvgpr_write source is a vreg.
+          if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
+            // Propagate the source reg of the accvgpr_write to this copy.
+            I.getOperand(1).setReg(DefSrcMO.getReg());
+            I.getOperand(1).setSubReg(DefSrcMO.getSubReg());
+
+            // Reg uses were changed; collect the unique set of registers so
+            // their live intervals can be updated at the end.
+            ModifiedRegs.insert(DefSrcMO.getReg());
+            ModifiedRegs.insert(SrcReg);
+
+            Changed = true;
+          }
+
+          // Found the defining accvgpr_write; stop looking any further.
+          break;
+        }
+      }
+      break;
+    }
+    case AMDGPU::S_MOV_B32:
+      if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() ||
+          I.getNumOperands() != 2)
+        return false;
+
+      switch (I.getOperand(0).getSubReg()) {
+      default:
         return false;
-      Def1 = &I;
-      Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+      case AMDGPU::sub0:
+        if (Def0)
+          return false;
+        Def0 = &I;
+        Init |= I.getOperand(1).getImm() & 0xffffffff;
+        break;
+      case AMDGPU::sub1:
+        if (Def1)
+          return false;
+        Def1 = &I;
+        Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+        break;
+      }
       break;
     }
   }
 
+  // For an AGPR dst reg, update the live intervals of any modified registers.
+  if (IsAGPRDst) {
+    if (Changed) {
+      for (Register RegToUpdate : ModifiedRegs) {
+        LIS->removeInterval(RegToUpdate);
+        LIS->createAndComputeVirtRegInterval(RegToUpdate);
+      }
+    }
+
+    return Changed;
+  }
+
+  // For an SGPR reg, check if the two 32-bit moves can be combined.
   if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
-    return false;
+    return Changed;
 
   LLVM_DEBUG(dbgs() << "Combining:\n  " << *Def0 << "  " << *Def1
                     << "    =>\n");
@@ -144,7 +219,7 @@
   TII = ST.getInstrInfo();
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  TRI = ST.getRegisterInfo();
 
   bool Changed = false;
 
@@ -153,8 +228,10 @@
     if (!LIS->hasInterval(Reg))
       continue;
     const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
+    if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
+        (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
       continue;
+
     Changed |= processReg(Reg);
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir
@@ -0,0 +1,139 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=liveintervals,amdgpu-pre-ra-optimizations -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
+
+---
+name: test_mfma_f32_4x4x1f32_propagate_vgpr
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_propagate_vgpr
+    ; GFX908: liveins: $sgpr0_sgpr1
+    ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec
+    ; GFX908: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec
+    ; GFX908: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]]
+    ; GFX908: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]]
+    ; GFX908: %4.sub3:areg_128 = COPY [[V_MOV_B32_e32_1]]
+    ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+    ; GFX908: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
+    ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; GFX908: S_ENDPGM 0
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %13:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec
+    undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %13:vgpr_32, implicit $exec
+    %11.sub1:areg_128 = COPY %11.sub0:areg_128
+    %11.sub2:areg_128 = COPY %11.sub0:areg_128
+    %11.sub3:areg_128 = COPY %11.sub0:areg_128
+    %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec
+    %12:vreg_128 = COPY %10:areg_128
+    GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+...
+---
+name: test_mfma_f32_4x4x1f32_no_propagate_imm
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_no_propagate_imm
+    ; GFX908: liveins: $sgpr0_sgpr1
+    ; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    ; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    ; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
+    ; GFX908: %3.sub1:areg_128 = COPY %3.sub0
+    ; GFX908: %3.sub2:areg_128 = COPY %3.sub0
+    ; GFX908: %3.sub3:areg_128 = COPY %3.sub0
+    ; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+    ; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    ; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
+    ; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; GFX908: S_ENDPGM 0
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
+    %11.sub1:areg_128 = COPY %11.sub0:areg_128
+    %11.sub2:areg_128 = COPY %11.sub0:areg_128
+    %11.sub3:areg_128 = COPY %11.sub0:areg_128
+    %8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
+    %9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
+    %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec
+    %12:vreg_128 = COPY %10:areg_128
+    GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+...
+---
+name: test_vgpr_subreg_propagate
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908-LABEL: name: test_vgpr_subreg_propagate
+    ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    ; GFX908: undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+    ; GFX908: %1.sub1:areg_128 = COPY [[COPY]].sub0
+    ; GFX908: %1.sub2:areg_128 = COPY [[COPY]].sub0
+    ; GFX908: %1.sub3:areg_128 = COPY [[COPY]].sub0
+    ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1
+    %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+    undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
+    %1.sub1:areg_128 = COPY %1.sub0:areg_128
+    %1.sub2:areg_128 = COPY %1.sub0:areg_128
+    %1.sub3:areg_128 = COPY %1.sub0:areg_128
+    S_ENDPGM 0, implicit %0, implicit %1
+...
+---
+name: test_nonmatching_agpr_subreg_no_propagate
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; GFX908-LABEL: name: test_nonmatching_agpr_subreg_no_propagate
+    ; GFX908: liveins: $vgpr0_vgpr1
+    ; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
+    ; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+    ; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
+    ; GFX908: [[COPY1:%[0-9]+]]:areg_64 = COPY %1
+    ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
+    undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
+    %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
+    %2:areg_64 = COPY %1:areg_64
+    S_ENDPGM 0, implicit %0, implicit %1, implicit %2
+...
+---
+name: test_subreg_to_single_agpr_reg_propagate
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; GFX908-LABEL: name: test_subreg_to_single_agpr_reg_propagate
+    ; GFX908: liveins: $vgpr0_vgpr1
+    ; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
+    ; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+    ; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
+    ; GFX908: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]].sub1
+    ; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
+    undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
+    %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
+    %2:agpr_32 = COPY %1.sub1:areg_64
+    S_ENDPGM 0, implicit %0, implicit %1, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -578,19 +578,12 @@
   ret void
 }
 
-; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved
-; in the middle of the expanded agpr reg_sequence. The broadcast of
-; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
 ; GCN:      v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
-; GFX908:   s_nop 0
-; GFX908:   v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]]
-; GFX908:   v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]]
-; GFX908:   v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]]
-; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
+; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
+; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
 ; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
 ; GCN: s_nop 0
 ; GFX908_A:  v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]