diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -111,7 +111,7 @@ std::pair isOMod(const MachineInstr &MI) const; bool tryFoldOMod(MachineInstr &MI); bool tryFoldRegSequence(MachineInstr &MI); - bool tryFoldLCSSAPhi(MachineInstr &MI); + bool tryFoldPhiAGPR(MachineInstr &MI); bool tryFoldLoad(MachineInstr &MI); public: @@ -138,6 +138,16 @@ char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const MachineOperand &MO) { + const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); + if (const TargetRegisterClass *SubRC = + TRI.getSubRegisterClass(RC, MO.getSubReg())) + RC = SubRC; + return RC; +} + // Map multiply-accumulate opcode to corresponding multiply-add opcode if any. static unsigned macToMad(unsigned Opc) { switch (Opc) { @@ -703,9 +713,13 @@ if (UseMI->isCopy() && OpToFold.isReg() && UseMI->getOperand(0).getReg().isVirtual() && !UseMI->getOperand(1).getSubReg()) { - LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); unsigned Size = TII->getOpSize(*UseMI, 1); Register UseReg = OpToFold.getReg(); + // Don't fold SGPR -> AGPR copies. + if (TRI->isSGPRReg(*MRI, UseReg) && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) + return; + LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); UseMI->getOperand(1).setReg(UseReg); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); @@ -1628,52 +1642,118 @@ return true; } -// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI. +// Try to hoist an AGPR to VGPR copy across a PHI. // This should allow folding of an AGPR into a consumer which may support it. 
-// I.e.: // -// loop: // loop: -// %1:vreg = COPY %0:areg // exit: -// exit: => // %1:areg = PHI %0:areg, %loop -// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg -bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) { +// Example 1: LCSSA PHI +// loop: +// %1:vreg = COPY %0:areg +// exit: +// %2:vreg = PHI %1:vreg, %loop +// => +// loop: +// exit: +// %1:areg = PHI %0:areg, %loop +// %2:vreg = COPY %1:areg +// +// Example 2: PHI with multiple incoming values: +// entry: +// %1:vreg = GLOBAL_LOAD(..) +// loop: +// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop +// %3:areg = COPY %2:vreg +// %4:areg = (instr using %3:areg) +// %5:vreg = COPY %4:areg +// => +// entry: +// %1:vreg = GLOBAL_LOAD(..) +// %2:areg = COPY %1:vreg +// loop: +// %3:areg = PHI %2:areg, %entry, %X:areg, +// %4:areg = (instr using %3:areg) +bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) { assert(PHI.isPHI()); - if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI - return false; - - Register PhiIn = PHI.getOperand(1).getReg(); Register PhiOut = PHI.getOperand(0).getReg(); - if (PHI.getOperand(1).getSubReg() || - !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut)) + if (!TRI->isVGPR(*MRI, PhiOut)) return false; - // A single use should not matter for correctness, but if it has another use - // inside the loop we may perform copy twice in a worst case. - if (!MRI->hasOneNonDBGUse(PhiIn)) - return false; + // Iterate once over all incoming values of the PHI to check if this PHI is + // eligible, and determine the exact AGPR RC we'll target. 
+ const TargetRegisterClass *ARC = nullptr; + for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) { + MachineOperand &MO = PHI.getOperand(K); - MachineInstr *Copy = MRI->getVRegDef(PhiIn); - if (!Copy || !Copy->isCopy()) - return false; + Register PhiIn = MO.getReg(); + if (MO.getSubReg() || !TRI->isVGPR(*MRI, PhiIn)) + return false; + + MachineInstr *Copy = MRI->getVRegDef(PhiIn); + if (!Copy || !Copy->isCopy()) + continue; + + Register CopyIn = Copy->getOperand(1).getReg(); + if (TRI->isAGPR(*MRI, CopyIn)) { + const TargetRegisterClass *CopyInRC = + getRegOpRC(*MRI, *TRI, Copy->getOperand(1)); + if (ARC && ARC != CopyInRC) + return false; + ARC = CopyInRC; + } + } - Register CopyIn = Copy->getOperand(1).getReg(); - if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg()) + if (!ARC) return false; - const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn); + // Rewrite the PHI's incoming values to ARC. + LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI); + for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) { + MachineOperand &MO = PHI.getOperand(K); + Register Reg = MO.getReg(); + + MachineBasicBlock::iterator InsertPt; + MachineBasicBlock *InsertMBB = nullptr; + + // Look at the def of Reg, ignoring all copies. + if (MachineInstr *Def = MRI->getVRegDef(Reg)) { + + // Look at pre-existing COPY instructions from ARC: Steal the operand. If + // the copy was single-use, it will be removed by DCE later. 
+ if (Def->isCopy() && getRegOpRC(*MRI, *TRI, Def->getOperand(1)) == ARC) { + MO.setReg(Def->getOperand(1).getReg()); + MO.setSubReg(Def->getOperand(1).getSubReg()); + continue; + } + + InsertPt = ++Def->getIterator(); + InsertMBB = Def->getParent(); + } else { + InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB(); + InsertPt = InsertMBB->getFirstTerminator(); + } + + Register NewReg = MRI->createVirtualRegister(ARC); + MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(), + TII->get(AMDGPU::COPY), NewReg) + .addReg(Reg); + MO.setReg(NewReg); + + (void)MI; + LLVM_DEBUG(dbgs() << " Created COPY: " << *MI); + } + + // Replace the PHI's result with a new register. Register NewReg = MRI->createVirtualRegister(ARC); - PHI.getOperand(1).setReg(CopyIn); PHI.getOperand(0).setReg(NewReg); + // COPY that new register back to the original PhiOut register. This COPY will + // usually be folded out later. MachineBasicBlock *MBB = PHI.getParent(); - BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(), + BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(), TII->get(AMDGPU::COPY), PhiOut) - .addReg(NewReg, RegState::Kill); - Copy->eraseFromParent(); // We know this copy had a single use. 
- - LLVM_DEBUG(dbgs() << "Folded " << PHI); + .addReg(NewReg); + LLVM_DEBUG(dbgs() << " Done: Folded " << PHI); return true; } @@ -1766,7 +1846,7 @@ continue; } - if (MI.isPHI() && tryFoldLCSSAPhi(MI)) { + if (MI.isPHI() && tryFoldPhiAGPR(MI)) { Changed = true; continue; } diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs 
< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: @@ -84,10 +84,8 @@ ; GCN-LABEL: {{^}}test_mfma_loop_non_splat: -; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} +; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} -; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -120,73 +118,102 @@ ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: -; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. -; 3 vgprs are needed to avoid wait states between writes. - -; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 -; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 -; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: 
v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; Check that we do not use 32 temp vgprs
+
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42f80000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fa0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fc0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fe0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43000000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43010000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43020000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43030000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43040000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43050000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43060000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43070000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43080000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43090000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430a0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430b0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430c0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430d0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430e0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430f0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43100000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43110000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43120000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43130000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43140000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43150000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43160000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43170000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43180000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43190000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x431a0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0

; GFX940_A-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}