Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -353,7 +353,8 @@ if (OffsetRegUsed && PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, + MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); } if (CopyBuffer && !CopyBufferFirst) { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1042,6 +1042,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, bool IsShader) { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); @@ -1072,7 +1073,11 @@ unsigned PrivateSegmentWaveByteOffsetReg; if (IsShader) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + if (Info.hasPrivateSegmentWaveByteOffsetInSGPR5()) + PrivateSegmentWaveByteOffsetReg = AMDGPU::SGPR5; + else + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); } else PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); @@ -1310,7 +1315,7 @@ // Start adding system SGPRs. if (IsEntryFunc) - allocateSystemSGPRs(CCInfo, MF, *Info, IsShader); + allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -171,6 +171,7 @@ bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; bool PrivateSegmentWaveByteOffset : 1; + bool PrivateSegmentWaveByteOffsetInSGPR5 : 1; bool WorkItemIDX : 1; // Always initialized. bool WorkItemIDY : 1; @@ -329,6 +330,10 @@ return PrivateSegmentWaveByteOffset; } + bool hasPrivateSegmentWaveByteOffsetInSGPR5() const { + return PrivateSegmentWaveByteOffsetInSGPR5; + } + bool hasWorkItemIDX() const { return WorkItemIDX; } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -71,6 +71,7 @@ WorkGroupIDZ(false), WorkGroupInfo(false), PrivateSegmentWaveByteOffset(false), + PrivateSegmentWaveByteOffsetInSGPR5(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), @@ -122,9 +123,15 @@ bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo.hasStackObjects(); - if (HasStackObjects || MaySpill) + if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + PrivateSegmentWaveByteOffsetInSGPR5 = + ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS); + } + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; Index: test/CodeGen/AMDGPU/local-stack-slot-bug.ll =================================================================== --- test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s - -; This used to fail due to a v_add_i32 instruction with an illegal immediate -; operand that was created during Local Stack Slot Allocation. Test case derived -; from https://bugs.freedesktop.org/show_bug.cgi?id=96602 -; -; CHECK-LABEL: {{^}}main: - -; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 -; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} -; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] - -; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] -; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] - -; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen -; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen -define amdgpu_ps float @main(i32 %idx) { -main_body: - %v1 = extractelement <81 x float> , i32 %idx - %v2 = extractelement <81 x float> , i32 %idx - %r = fadd float %v1, %v2 - ret float %r -} Index: test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scratch-simple.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s + +; This used to fail due to a v_add_i32 instruction with an illegal immediate +; operand that was created during Local Stack Slot Allocation. Test case derived +; from https://bugs.freedesktop.org/show_bug.cgi?id=96602 +; +; GCN-LABEL: {{^}}ps_main: + +; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} +; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 +; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] + +; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] + +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_ps float @ps_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}vs_main: +; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_vs float @vs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}cs_main: +; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_cs float @cs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}hs_main: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_hs float @hs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}gs_main: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_gs float @gs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s2, s5 +define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %f = fadd float %v1, %v2 + %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 + %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 + ret <{i32, i32, i32, float}> %r2 +} + +; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s2, s5 +define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %f = fadd float %v1, %v2 + %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 + %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 + ret <{i32, i32, i32, float}> %r2 +}