diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -111,7 +111,7 @@
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
   bool tryFoldRegSequence(MachineInstr &MI);
-  bool tryFoldLCSSAPhi(MachineInstr &MI);
+  bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
 
 public:
@@ -1628,52 +1628,177 @@
   return true;
 }
 
-// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// Looks below Begin in its block for an existing full-width COPY of FromReg
+// into ToRC, so it can be reused instead of emitting a second copy.
+static Register tryFindExistingCopy(MachineRegisterInfo &MRI,
+                                    MachineInstr &Begin, Register FromReg,
+                                    const TargetRegisterClass *ToRC) {
+  MachineBasicBlock &MBB = *Begin.getParent();
+  for (auto It = ++Begin.getIterator(); It != MBB.end(); ++It) {
+    if (!It->isCopy())
+      continue;
+
+    Register CopyOut = It->getOperand(0).getReg();
+    Register CopyIn = It->getOperand(1).getReg();
+    if (CopyIn == FromReg &&
+        It->getOperand(1).getSubReg() == AMDGPU::NoSubRegister &&
+        It->getOperand(0).getSubReg() == AMDGPU::NoSubRegister &&
+        MRI.getRegClass(CopyOut) == ToRC)
+      return CopyOut;
+  }
+
+  return Register();
+}
+
+// Try to hoist an AGPR to VGPR copy across a PHI.
 // This should allow folding of an AGPR into a consumer which may support it.
-// I.e.:
 //
-// loop:                             // loop:
-//   %1:vreg = COPY %0:areg          // exit:
-// exit:                        =>   //   %1:areg = PHI %0:areg, %loop
-//   %2:vreg = PHI %1:vreg, %loop    //   %2:vreg = COPY %1:areg
-bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+// Example 1: LCSSA PHI
+// loop:
+//   %1:vreg = COPY %0:areg
+// exit:
+//   %2:vreg = PHI %1:vreg, %loop
+// =>
+// loop:
+// exit:
+//   %1:areg = PHI %0:areg, %loop
+//   %2:vreg = COPY %1:areg
+//
+// Example 2: PHI with multiple incoming values:
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+// loop:
+//   %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
+//   %3:areg = COPY %2:vreg
+//   %4:areg = (instr using %3:areg)
+//   %5:vreg = COPY %4:areg
+// =>
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+//   %2:areg = COPY %1:vreg
+// loop:
+//   %3:areg = PHI %2:areg, %entry, %4:areg, %loop
+//   %4:areg = (instr using %3:areg)
+bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
   assert(PHI.isPHI());
 
-  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
-    return false;
-
-  Register PhiIn = PHI.getOperand(1).getReg();
   Register PhiOut = PHI.getOperand(0).getReg();
-  if (PHI.getOperand(1).getSubReg() ||
-      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+  if (!TRI->isVGPR(*MRI, PhiOut))
     return false;
 
-  // A single use should not matter for correctness, but if it has another use
-  // inside the loop we may perform copy twice in a worst case.
-  if (!MRI->hasOneNonDBGUse(PhiIn))
-    return false;
+  // Look at all incoming values of the PHI.
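+  // Each incoming value must be a plain VGPR with no subregister. If an
+  // incoming value is defined by a "vreg = COPY areg", all such copies must
+  // agree on a single AGPR register class (ARC) or the fold is abandoned.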
+  const TargetRegisterClass *ARC = nullptr;
+  SmallVector<MachineOperand *> PhiIncs;
+  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+    MachineOperand &MO = PHI.getOperand(K);
+    if (!MO.getReg())
+      return false;
 
-  MachineInstr *Copy = MRI->getVRegDef(PhiIn);
-  if (!Copy || !Copy->isCopy())
-    return false;
+    Register PhiIn = MO.getReg();
+    if (MO.getSubReg() || !TRI->isVGPR(*MRI, PhiIn))
+      return false;
+
+    PhiIncs.push_back(&PHI.getOperand(K));
+    MachineInstr *Copy = MRI->getVRegDef(PhiIn);
+    if (!Copy || !Copy->isCopy())
+      continue;
 
-  Register CopyIn = Copy->getOperand(1).getReg();
-  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+    Register CopyIn = Copy->getOperand(1).getReg();
+    if (TRI->isAGPR(*MRI, CopyIn)) {
+      const TargetRegisterClass *CopyInRC = MRI->getRegClass(CopyIn);
+      if (unsigned SubReg = Copy->getOperand(1).getSubReg())
+        CopyInRC = TRI->getSubRegisterClass(CopyInRC, SubReg);
+      if (ARC && ARC != CopyInRC)
+        return false;
+      ARC = CopyInRC;
+    }
+  }
+
+  if (!ARC)
     return false;
 
-  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
+
+  // Rewrite the PHI's incoming values to ARC.
+  for (MachineOperand *MO : PhiIncs) {
+    Register Reg = MO->getReg();
+
+    // Single-use COPY.
+    MachineInstr *Def = MRI->getVRegDef(Reg);
+    if (Def && Def->getOpcode() == AMDGPU::COPY && MRI->hasOneNonDBGUse(Reg)) {
+      MachineOperand &CopyInMO = Def->getOperand(1);
+      Register CopyIn = CopyInMO.getReg();
+
+      // Copies to ARC: Steal the operand, remove the COPY.
+      const TargetRegisterClass *CopyRC = MRI->getRegClass(CopyIn);
+      if (CopyInMO.getSubReg())
+        CopyRC = TRI->getSubRegisterClass(CopyRC, CopyInMO.getSubReg());
+
+      if (CopyRC == ARC) {
+        MO->setReg(CopyIn);
+        MO->setSubReg(CopyInMO.getSubReg());
+        LLVM_DEBUG(dbgs() << "  Removing COPY: " << *Def);
+        Def->eraseFromParent();
+        continue;
+      }
+
+      // Copies to something else: Fix the COPY.
+      Register NewReg = MRI->createVirtualRegister(ARC);
+      LLVM_DEBUG(dbgs() << "  Changing COPY: " << *Def);
+      Def->getOperand(0).setReg(NewReg);
+      LLVM_DEBUG(dbgs() << "    -> " << *Def);
+      MO->setReg(NewReg);
+      continue;
+    }
+
+    MachineBasicBlock::iterator InsertPt;
+    MachineBasicBlock *InsertMBB = nullptr;
+    if (Def) {
+      if (Register Reuse = tryFindExistingCopy(*MRI, *Def, Reg, ARC)) {
+        LLVM_DEBUG(dbgs() << "  Reusing: " << *MRI->getVRegDef(Reuse));
+        MO->setReg(Reuse);
+        continue;
+      }
+
+      InsertPt = ++Def->getIterator();
+      InsertMBB = Def->getParent();
+    } else {
+      InsertMBB = PHI.getOperand(MO->getOperandNo() + 1).getMBB();
+      InsertPt = InsertMBB->getFirstTerminator();
+    }
+    // TODO: Is using the PHI's DebugLoc okay here?
+    const unsigned Flags = MRI->hasOneNonDBGUse(Reg) ? RegState::Kill : 0;
+    Register NewReg = MRI->createVirtualRegister(ARC);
+    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
+                               TII->get(AMDGPU::COPY), NewReg)
+                           .addReg(Reg, Flags);
+    MO->setReg(NewReg);
+
+    (void)MI;
+    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
+  }
+
+  // Replace the PHI's result with a new register.
   Register NewReg = MRI->createVirtualRegister(ARC);
-  PHI.getOperand(1).setReg(CopyIn);
   PHI.getOperand(0).setReg(NewReg);
 
+  // COPY that new register back to the original PhiOut register. This COPY can
+  // be folded out later.
+  //
+  // TODO: What if the COPY isn't folded out, could this fold make things worse?
+  // Should we check that the PHI is only used by a COPY to areg when it's not
+  // an LCSSA PHI (PHI with one incoming value/block)?
+  //
+  // TODO: Is using the PHI's DebugLoc okay here?
   MachineBasicBlock *MBB = PHI.getParent();
-  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
           TII->get(AMDGPU::COPY), PhiOut)
-      .addReg(NewReg, RegState::Kill);
-  Copy->eraseFromParent(); // We know this copy had a single use.
-
-  LLVM_DEBUG(dbgs() << "Folded " << PHI);
+      .addReg(NewReg, RegState::Kill);
+  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
 
   return true;
 }
@@ -1766,7 +1891,7 @@
       continue;
     }
 
-    if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
+    if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
       Changed = true;
       continue;
    }
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
 
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s
 
 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
 
@@ -84,10 +84,8 @@
 
 ; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
 
-; GCN:             v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
+; GCN-COUNT-31:    v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
 ; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
-; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
 
 ; GCN: [[LOOP:.LBB[0-9_]+]]:
 ; GCN-NOT: v_accvgpr
@@ -120,73 +118,102 @@
 
 ; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
 
-; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
-; 3 vgprs are needed to avoid wait states between writes.
-
-; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
-; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
-; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
-; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
-; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; Check that we do not use 32 temp vgprs.
+
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42f80000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fa0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fc0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x42fe0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43000000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43010000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43020000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43030000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43040000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43050000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43060000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43070000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43080000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43090000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430a0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430b0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430c0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430d0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430e0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x430f0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43100000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43110000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43120000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43130000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43140000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43150000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43160000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43170000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43180000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x43190000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GFX908: v_mov_b32_e32 v0, 0x431a0000
+; GFX908: s_nop 1
+; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v0
 
 ; GFX940_A-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
 ; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
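
---

Note: a minimal sketch (not part of the diff; register numbers hypothetical) of
what tryFoldPhiAGPR does when an incoming value is not defined by a COPY, in
the same pseudo-MIR notation the new comment block uses. A COPY into the AGPR
class is inserted right after the def (or an equivalent existing COPY is reused
via tryFindExistingCopy), and the PHI result moves into the AGPR class with a
COPY back to the original VGPR left behind for later folding:

  entry:
    %0:vreg = GLOBAL_LOAD(..)      ; incoming value, not defined by a COPY
  exit:
    %1:vreg = PHI %0:vreg, %entry
  =>
  entry:
    %0:vreg = GLOBAL_LOAD(..)
    %2:areg = COPY %0:vreg         ; inserted after the def, or reused
  exit:
    %3:areg = PHI %2:areg, %entry
    %1:vreg = COPY %3:areg         ; COPY back; may be folded out later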