Index: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -353,7 +353,8 @@ if (OffsetRegUsed && PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, + MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); } if (CopyBuffer && !CopyBufferFirst) { Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1042,6 +1042,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, bool IsShader) { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); @@ -1072,8 +1073,15 @@ unsigned PrivateSegmentWaveByteOffsetReg; if (IsShader) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + PrivateSegmentWaveByteOffsetReg = + Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); + + // This is true if the scratch wave byte offset doesn't have a fixed + // location. + if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } } else PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); @@ -1310,7 +1318,7 @@ // Start adding system SGPRs. if (IsEntryFunc) - allocateSystemSGPRs(CCInfo, MF, *Info, IsShader); + allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -122,9 +122,15 @@ bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo.hasStackObjects(); - if (HasStackObjects || MaySpill) + if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } + if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; Index: llvm/trunk/test/CodeGen/AMDGPU/local-stack-slot-bug.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ llvm/trunk/test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -1,26 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s - -; This used to fail due to a v_add_i32 instruction with an illegal immediate -; operand that was created during Local Stack Slot Allocation. Test case derived -; from https://bugs.freedesktop.org/show_bug.cgi?id=96602 -; -; CHECK-LABEL: {{^}}main: - -; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 -; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} -; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] - -; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] -; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] - -; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen -; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen -define amdgpu_ps float @main(i32 %idx) { -main_body: - %v1 = extractelement <81 x float> , i32 %idx - %v2 = extractelement <81 x float> , i32 %idx - %r = fadd float %v1, %v2 - ret float %r -} Index: llvm/trunk/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/trunk/test/CodeGen/AMDGPU/scratch-simple.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s + +; This used to fail due to a v_add_i32 instruction with an illegal immediate +; operand that was created during Local Stack Slot Allocation. Test case derived +; from https://bugs.freedesktop.org/show_bug.cgi?id=96602 +; +; GCN-LABEL: {{^}}ps_main: + +; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} +; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 +; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] + +; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] + +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_ps float @ps_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}vs_main: +; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_vs float @vs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}cs_main: +; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_cs float @cs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}hs_main: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_hs float @hs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}gs_main: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +define amdgpu_gs float @gs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %r = fadd float %v1, %v2 + ret float %r +} + +; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s2, s5 +define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %f = fadd float %v1, %v2 + %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 + %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 + ret <{i32, i32, i32, float}> %r2 +} + +; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: +; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 +; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s2, s5 +define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx + %v2 = extractelement <81 x float> , i32 %idx + %f = fadd float %v1, %v2 + %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2 + %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3 + ret <{i32, i32, i32, float}> %r2 +}