diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -525,6 +525,7 @@ // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); buildGitPtr(MBB, I, DL, TII, Rsrc01); @@ -546,6 +547,20 @@ .addImm(0) // cpol .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); + + // The driver will always set the SRD for wave64 (bits 118:117 of the + // descriptor / bits 22:21 of sub-register sub3 will be 0b11). + // If the shader is actually wave32 we have to modify the const_index_stride + // field in sub3 of the descriptor (bits 22:21) to 0b10 (stride=32). The + // reason the driver does this is that there can be cases where it presents + // 2 shaders with different wave sizes (e.g. VsFs). 
+ // TODO: convert to using SCRATCH instructions or multiple SRD buffers + if (ST.isWave32()) { + const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); + BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) + .addImm(21) + .addReg(Rsrc03, RegState::ImplicitDefine); + } } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -3,7 +3,8 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s ; Check that it doesn't crash -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s target datalayout = "A5" @@ -13,8 +14,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_getpc_b64 s[36:37] ; GFX9-NEXT: s_mov_b32 s36, s0 -; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10 +; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s0 @@ -23,6 +24,23 @@ ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_simple_indirect_call: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_getpc_b64 s[36:37] +; GFX10-NEXT: s_mov_b32 s36, s0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_bitset0_b32 s39, 21 +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: s_add_u32 s36, s36, s0 +; GFX10-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-NEXT: s_mov_b64 
s[0:1], s[36:37] +; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_endpgm + %pc = call i64 @llvm.amdgcn.s.getpc() %fun = inttoptr i64 %pc to void()*