Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4576,6 +4576,10 @@
       VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
                 ? RI.getEquivalentAGPRClass(SRC)
                 : RI.getEquivalentVGPRClass(SRC);
+    } else {
+      VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+                ? RI.getEquivalentAGPRClass(VRC)
+                : RI.getEquivalentVGPRClass(VRC);
     }
     RC = VRC;
   } else {
Index: llvm/test/CodeGen/AMDGPU/mfma-loop.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,13 +1,64 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
-; GCN-COUNT-32: v_accvgpr_write_b32
+
+; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
+; 3 vgprs are needed to avoid wait states between writes.
+
+; FIXME: We should not be using any temporary registers at all.
+; At the moment we initialize an sgpr, then copy it via vgprs.
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; Check that we do not copy agprs to vgprs and back inside the loop.
+
 ; GCN: [[LOOP:BB[0-9_]+]]:
 ; GCN-NOT: v_accvgpr
 ; GCN: v_mfma_f32_32x32x1f32
 ; GCN-NOT: v_accvgpr
 ; GCN: s_cbranch_scc1 [[LOOP]]
+
+; Final result should be read only once after the loop.
+
 ; GCN-COUNT-32: v_accvgpr_read_b32
+
 define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
 entry:
   br label %for.cond.preheader