diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -717,7 +717,7 @@ if (!IsEntryFunc) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) + if (AMDGPUTargetMachine::EnableFixedFunctionABI && !IsGraphics) TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } @@ -1237,7 +1237,7 @@ SmallVector, 12> ImplicitArgRegs; if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + !AMDGPU::isGraphics(CalleeCC)) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1304,7 +1304,7 @@ const DataLayout &DL = F.getParent()->getDataLayout(); if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + !AMDGPU::isGraphics(Info.CallConv)) { LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); return false; } @@ -1363,7 +1363,7 @@ SmallVector, 12> ImplicitArgRegs; if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + !AMDGPU::isGraphics(Info.CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2064,7 +2064,8 @@ auto &ArgInfo = Info.getArgInfo(); // We need to allocate these in place regardless of their use. - const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; + const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI && + !AMDGPU::isGraphics(CCInfo.getCallingConv()); // TODO: Unify handling with private memory pointers. if (IsFixed || Info.hasDispatchPtr()) @@ -2422,7 +2423,7 @@ allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); } else { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) + if (AMDGPUTargetMachine::EnableFixedFunctionABI && !IsGraphics) allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } @@ -3125,7 +3126,7 @@ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); if (AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + !AMDGPU::isGraphics(CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3265,7 +3266,7 @@ } if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + !AMDGPU::isGraphics(CallConv)) { // Copy special input registers after user input arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -65,7 +65,7 @@ // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't // have any calls. const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && - CC != CallingConv::AMDGPU_Gfx && + !AMDGPU::isGraphics(CC) && (!isEntryFunction() || HasCalls); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -147,7 +147,8 @@ ; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: -; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} +; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} +; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}} ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: @@ -189,13 +190,15 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; SDAG-NEXT: .vgpr_count: 0x2a{{$}} +; GISEL-NEXT: .vgpr_count: 0x34{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; SDAG-NEXT: .vgpr_count: 0x2a{{$}} +; GISEL-NEXT: .vgpr_count: 0x34{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -227,7 +230,8 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; SDAG-NEXT: .vgpr_count: 0x2b{{$}} +; GISEL-NEXT: .vgpr_count: 0x34{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9778,30 +9778,29 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 -; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9811,20 +9810,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[30:31] @@ -9833,10 +9831,10 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9847,29 +9845,27 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:4 -; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:12 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)