Index: llvm/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.h +++ llvm/lib/Target/AMDGPU/AMDGPU.h @@ -56,6 +56,7 @@ ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); +FunctionPass *createGCNPreRaFixupsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} @@ -334,6 +335,9 @@ void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; +void initializeGCNPreRaFixupsPass(PassRegistry &); +extern char &GCNPreRaFixupsID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -378,6 +378,7 @@ initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + initializeGCNPreRaFixupsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -1223,6 +1224,7 @@ insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); + insertPass(&TwoAddressInstructionPassID, &GCNPreRaFixupsID); TargetPassConfig::addFastRegAlloc(); } @@ -1239,6 +1241,8 @@ if (isPassEnabled(EnablePreRAOptimizations)) insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); + insertPass(&ProcessImplicitDefsID, &GCNPreRaFixupsID); + // This is not an essential optimization and it has a noticeable impact on // compilation time, so we only enable it from O2. 
if (TM->getOptLevel() > CodeGenOpt::Less) Index: llvm/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -106,6 +106,7 @@ GCNPreRAOptimizations.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp + GCNPreRaFixups.cpp R600AsmPrinter.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp Index: llvm/lib/Target/AMDGPU/GCNPreRaFixups.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AMDGPU/GCNPreRaFixups.cpp @@ -0,0 +1,97 @@ +//===--------------------- GCNPreRaFixups.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-pre-ra-fixups" + +namespace { + +class GCNPreRaFixups : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + + bool handleMAI(MachineInstr &MI) const; + +public: + static char ID; + + GCNPreRaFixups() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "GCN Pre-RA Fixups"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char GCNPreRaFixups::ID = 0; + +INITIALIZE_PASS(GCNPreRaFixups, DEBUG_TYPE, "GCN Pre-RA Fixups", false, false) + +char &llvm::GCNPreRaFixupsID = GCNPreRaFixups::ID; + +FunctionPass *llvm::createGCNPreRaFixupsPass() { + 
return new GCNPreRaFixups(); +} + +bool GCNPreRaFixups::handleMAI(MachineInstr &MI) const { + if (!TII->requiresNoAliasingBetweenDstAndSrcC(MI) || + MI.getOperand(0).isTied()) + return false; + + // SrcC shall not partially overlap with Dst. Either tie it to Dst if + // SrcC has a single use and we can overwrite it or set earlyclobber + // on the Dst to allocate a disjoint register otherwise. + auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + Register Reg = Src2->getReg(); + if (Reg.isVirtual() && MRI->hasOneUse(Reg)) { + MachineInstrBuilder(*MI.getMF(), &MI) + .addReg(Reg, RegState::Implicit, Src2->getSubReg()); + MI.tieOperands(0, MI.getNumOperands() - 1); + return true; + } + + MI.getOperand(0).setIsEarlyClobber(); + return true; +} + +bool GCNPreRaFixups::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + + if (!ST.hasMAIInsts()) + return false; + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (SIInstrInfo::isMAI(MI)) + Changed |= handleMAI(MI); + } + } + + return Changed; +} Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1132,6 +1132,11 @@ static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + + /// An MFMA instruction reading more than 4 registers as SrcC shall have + /// SrcC either completely disjoint from the Dst or have SrcC exactly the + /// same as Dst. Partial overlap is not supported. 
+ bool requiresNoAliasingBetweenDstAndSrcC(const MachineInstr &MI) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8268,3 +8268,17 @@ return false; } + +bool +SIInstrInfo::requiresNoAliasingBetweenDstAndSrcC(const MachineInstr &MI) const { + if (!isMAI(MI)) + return false; + const MachineOperand *SrcC = getNamedOperand(MI, AMDGPU::OpName::src2); + if (!SrcC || !SrcC->isReg()) + return false; + Register Reg = SrcC->getReg(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + const TargetRegisterClass *RC = TRI->getRegClassForReg(MRI, Reg); + return RC->MC->getSizeInBits() > 128; +} Index: llvm/test/CodeGen/AMDGPU/acc-ldst.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -168,10 +168,13 @@ ret void } +; FIXME: there are v_accvgpr_read_b32 instructions in between the two mfma +; since the result is marked earlyclobber.
; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store: ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 +; GCN: v_mfma_f32_32x32x1f32 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) { Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -113,6 +113,7 @@ ; GCN-O0-NEXT: Virtual Register Map ; GCN-O0-NEXT: Live Register Matrix ; GCN-O0-NEXT: SI Pre-allocate WWM Registers +; GCN-O0-NEXT: GCN Pre-RA Fixups ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator @@ -308,6 +309,7 @@ ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: Process Implicit Definitions +; GCN-O1-NEXT: GCN Pre-RA Fixups ; GCN-O1-NEXT: Remove unreachable machine basic blocks ; GCN-O1-NEXT: Live Variable Analysis ; GCN-O1-NEXT: MachineDominator Tree Construction @@ -593,6 +595,7 @@ ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: Process Implicit Definitions +; GCN-O1-OPTS-NEXT: GCN Pre-RA Fixups ; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks ; GCN-O1-OPTS-NEXT: Live Variable Analysis ; GCN-O1-OPTS-NEXT: SI Optimize VGPR LiveRange @@ -879,6 +882,7 @@ ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: Process Implicit Definitions +; GCN-O2-NEXT: GCN Pre-RA Fixups ; GCN-O2-NEXT: Remove unreachable machine basic blocks ; GCN-O2-NEXT: Live Variable Analysis ; GCN-O2-NEXT: SI Optimize VGPR LiveRange @@ -1179,6 +1183,7 @@ ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine 
instructions ; GCN-O3-NEXT: Process Implicit Definitions +; GCN-O3-NEXT: GCN Pre-RA Fixups ; GCN-O3-NEXT: Remove unreachable machine basic blocks ; GCN-O3-NEXT: Live Variable Analysis ; GCN-O3-NEXT: SI Optimize VGPR LiveRange Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -602,14 +602,15 @@ ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: ; GFX90A-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GFX90A-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GCN-COUNT-8: global_load_dwordx4 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GFX90A-NOT: v_accvgpr_write +; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 ; GFX908-COUNT-8: global_store_dwordx4 ; GFX90A-NOT: v_accvgpr_read_b32 ; GFX90A-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] Index: llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope 
--check-prefixes=GCN,GREEDY %s + RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s + +; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC +; are either completely disjoint or exactly the same, but do not alias. + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: +; GREEDY: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; GREEDY: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; FAST: v_mfma_f32_32x32x1f32 a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; FAST: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; GCN: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: +; GREEDY: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; FAST:
v_mfma_f32_16x16x1f32 a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; FAST: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; GCN: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> + %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.3, <16 x float> addrspace(1)* %arg + ret void +} + +; This instruction allows the overlap since it only reads 4 registers. + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: +; GREEDY: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GREEDY: v_mfma_f32_4x4x1f32 a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] +; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> + %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0) + 
store <4 x float> %mai.3, <4 x float> addrspace(1)* %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } Index: llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll +++ llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll @@ -41,4 +41,4 @@ ret void } -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" } Index: llvm/test/CodeGen/AMDGPU/spill-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,37 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s -; GCN-LABEL: {{^}}max_24regs_32a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GFX908-NOT: v_accvgpr_write_b32 -; GFX90A: v_accvgpr_write_b32 -; GCN: ScratchSize: 0 -define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { -bb: - %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg - %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) - %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) - %elt1 = extractelement <16 x float> %mai.2, i32 0 - %elt2 = extractelement <16 x float> %mai.1, i32 15 - %elt3 = extractelement <16 x float> 
%mai.1, i32 14 - %elt4 = extractelement <16 x float> %mai.2, i32 1 - store float %elt1, float addrspace(1)* %out - %gep1 = getelementptr float, float addrspace(1)* %out, i64 1 - store float %elt2, float addrspace(1)* %gep1 - %gep2 = getelementptr float, float addrspace(1)* %out, i64 2 - store float %elt3, float addrspace(1)* %gep2 - %gep3 = getelementptr float, float addrspace(1)* %out, i64 3 - store float %elt4, float addrspace(1)* %gep3 - - ret void -} - ; GCN-LABEL: {{^}}max_12regs_13a_used: ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 @@ -152,7 +121,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }