diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -697,8 +697,10 @@ unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::V_ACCVGPR_READ_B32; - return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) - .addReg(Src, getKillRegState(IsKill)); + auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + .addReg(Src, getKillRegState(IsKill)); + MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); + return MIB; } // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not @@ -871,10 +873,12 @@ RS->setRegUsed(TmpReg); } if (IsStore) { - auto AccRead = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) + auto AccRead = BuildMI(*MBB, MI, DL, + TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) .addReg(SubReg, getKillRegState(IsKill)); if (NeedSuperRegDef) AccRead.addReg(ValueReg, RegState::ImplicitDefine); + AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); } SubReg = TmpReg; } @@ -908,10 +912,12 @@ if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); - if (!IsStore && TmpReg != AMDGPU::NoRegister) + if (!IsStore && TmpReg != AMDGPU::NoRegister) { MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), FinalReg) .addReg(TmpReg, RegState::Kill); + MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); + } } else { if (NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -5,10 +5,10 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse ; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { bb: @@ -34,10 +34,10 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse ; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { bb: @@ -55,8 +55,7 @@ st: %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32 - store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep1 - store <4 x float> %mai.2, <4 x float> addrspace(1)* %gep2 + call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2) ret void } @@ -65,25 +64,20 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC -; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse +; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 ; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill ; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] +; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { - %v0 = load volatile i32, i32 addrspace(3)* undef - %v1 = load volatile i32, i32 addrspace(3)* undef - %v2 = load volatile i32, i32 addrspace(3)* undef - %v3 = load volatile i32, i32 addrspace(3)* undef - %v4 = load volatile i32, i32 addrspace(3)* undef - %v5 = load volatile i32, i32 addrspace(3)* undef - %v6 = load volatile i32, i32 addrspace(3)* undef - %v7 = load volatile i32, i32 addrspace(3)* undef - call void asm sideeffect "", "a,a,a,a,~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6}"(i32 %v0, i32 %v1, i32 %v2, i32 %v3) - %v8 = load volatile i32, i32 addrspace(3)* undef - call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8) + %a1 = call <4 x i32> asm sideeffect "", "=a"() + %a2 = call <4 x i32> asm sideeffect "", "=a"() + %a3 = call i32 asm sideeffect "", "=a"() + %a4 = call <2 x i32> asm sideeffect "", "=a"() + call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3) + call void asm sideeffect "", "a"(<2 x i32> %a4) ret void } @@ -91,10 +85,10 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 +; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse ; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill ; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: @@ -115,6 +109,6 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } -attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -5,15 +5,15 @@ ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse ; GCN: NumVgprs: 10 ; GFX900: ScratchSize: 12 @@ -59,10 +59,10 @@ ; GCN-LABEL: {{^}}max_10_vgprs_used_9a: ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse ; GFX908: buffer_store_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ -; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 +; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse ; GFX908: buffer_load_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ @@ -113,28 +113,28 @@ ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1 -; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse ; GFX900: buffer_store_dword v{{[0-9]}}, ; GCN-DAG: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GCN-DAG: buffer_load_dword v{{[0-9]}}, -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse ; GCN: NumVgprs: 10 ; GFX900: ScratchSize: 44 @@ -166,8 +166,8 @@ ; GCN-LABEL: {{^}}max_10_vgprs_spill_v32: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 a0, -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse ; GCN-NOT: a10 ; GCN: buffer_store_dword v{{[0-9]}},