Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -913,16 +913,24 @@ InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z; } - if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { + if (!InputReg && + (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { InputReg = MRI.createGenericVirtualRegister(S32); - - // Workitem ids are already packed, any of present incoming arguments will - // carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + MIRBuilder.buildUndef(InputReg); + } else { + // Workitem ids are already packed, any of present incoming arguments will + // carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u); - LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, - &AMDGPU::VGPR_32RegClass, S32); + LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, + &AMDGPU::VGPR_32RegClass, S32); + } } if (OutgoingArg->isRegister()) { Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2911,13 +2911,21 @@ } if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { - // Workitem ids are already packed, any of present incoming arguments - // will carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : - IncomingArgY ? *IncomingArgY : - *IncomingArgZ, ~0u); - InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + InputReg = DAG.getUNDEF(MVT::i32); + } else { + // Workitem ids are already packed, any of present incoming arguments + // will carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : + IncomingArgY ? *IncomingArgY : + *IncomingArgZ, ~0u); + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + } } if (OutgoingArg->isRegister()) { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -5175,6 +5175,39 @@ ret void } +define amdgpu_ps void @amdgpu_ps_call_default_cc() { + ; CHECK-LABEL: name: amdgpu_ps_call_default_cc + ; CHECK: bb.1.main_body: + ; CHECK-NEXT: [[C:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY [[DEF]](p4) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p4) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C1]], [[C2]](s64) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[DEF]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY]](p4) + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY2]](s32) + ; CHECK-NEXT: $vgpr31 = COPY [[COPY3]](s32) + ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[C]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: S_ENDPGM 0 +main_body: + call void null() + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } Index: llvm/test/CodeGen/AMDGPU/amdpal-callable.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s ; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s -declare float @extern_func(float) #0 -declare float @extern_func_many_args(<64 x float>) #0 +declare amdgpu_gfx float @extern_func(float) #0 +declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0 @funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4 Index: llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG -enable-var-scope %s +; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL -enable-var-scope %s + +declare void @extern_c_func() + +define amdgpu_gfx void @gfx_func() { +; SDAG-LABEL: gfx_func: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: s_mov_b64 exec, s[34:35] +; SDAG-NEXT: v_writelane_b32 v40, s33, 26 +; SDAG-NEXT: v_writelane_b32 v40, s4, 0 +; SDAG-NEXT: v_writelane_b32 v40, s5, 1 +; SDAG-NEXT: v_writelane_b32 v40, s6, 2 +; SDAG-NEXT: v_writelane_b32 v40, s7, 3 +; SDAG-NEXT: v_writelane_b32 v40, s8, 4 +; SDAG-NEXT: v_writelane_b32 v40, s9, 5 +; SDAG-NEXT: v_writelane_b32 v40, s10, 6 +; SDAG-NEXT: v_writelane_b32 v40, s11, 7 +; SDAG-NEXT: v_writelane_b32 v40, s12, 8 +; SDAG-NEXT: v_writelane_b32 v40, s13, 9 +; SDAG-NEXT: v_writelane_b32 v40, s14, 10 +; SDAG-NEXT: v_writelane_b32 v40, s15, 11 +; SDAG-NEXT: v_writelane_b32 v40, s16, 12 +; SDAG-NEXT: v_writelane_b32 v40, s17, 13 +; SDAG-NEXT: v_writelane_b32 v40, s18, 14 +; SDAG-NEXT: v_writelane_b32 v40, s19, 15 +; SDAG-NEXT: v_writelane_b32 v40, s20, 16 +; SDAG-NEXT: v_writelane_b32 v40, s21, 17 +; SDAG-NEXT: s_mov_b32 s33, s32 +; SDAG-NEXT: s_addk_i32 s32, 0x400 +; SDAG-NEXT: v_writelane_b32 v40, s22, 18 +; SDAG-NEXT: v_writelane_b32 v40, s23, 19 +; SDAG-NEXT: s_mov_b64 s[36:37], s[30:31] +; SDAG-NEXT: s_getpc_b64 s[30:31] +; SDAG-NEXT: s_add_u32 s30, s30, extern_c_func@gotpcrel32@lo+4 +; SDAG-NEXT: s_addc_u32 s31, s31, extern_c_func@gotpcrel32@hi+12 +; SDAG-NEXT: v_writelane_b32 v40, s24, 20 +; SDAG-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; SDAG-NEXT: v_writelane_b32 v40, s25, 21 +; SDAG-NEXT: v_writelane_b32 v40, s26, 22 +; SDAG-NEXT: v_writelane_b32 v40, s27, 23 +; SDAG-NEXT: v_writelane_b32 v40, s28, 24 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 +; SDAG-NEXT: v_writelane_b32 v40, s29, 25 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_swappc_b64 s[30:31], s[30:31] +; SDAG-NEXT: v_readlane_b32 s29, v40, 25 +; SDAG-NEXT: v_readlane_b32 s28, v40, 24 +; SDAG-NEXT: v_readlane_b32 s27, v40, 23 +; SDAG-NEXT: v_readlane_b32 s26, v40, 22 +; SDAG-NEXT: v_readlane_b32 s25, v40, 21 +; SDAG-NEXT: v_readlane_b32 s24, v40, 20 +; SDAG-NEXT: v_readlane_b32 s23, v40, 19 +; SDAG-NEXT: v_readlane_b32 s22, v40, 18 +; SDAG-NEXT: v_readlane_b32 s21, v40, 17 +; SDAG-NEXT: v_readlane_b32 s20, v40, 16 +; SDAG-NEXT: v_readlane_b32 s19, v40, 15 +; SDAG-NEXT: v_readlane_b32 s18, v40, 14 +; SDAG-NEXT: v_readlane_b32 s17, v40, 13 +; SDAG-NEXT: v_readlane_b32 s16, v40, 12 +; SDAG-NEXT: v_readlane_b32 s15, v40, 11 +; SDAG-NEXT: v_readlane_b32 s14, v40, 10 +; SDAG-NEXT: v_readlane_b32 s13, v40, 9 +; SDAG-NEXT: v_readlane_b32 s12, v40, 8 +; SDAG-NEXT: v_readlane_b32 s11, v40, 7 +; SDAG-NEXT: v_readlane_b32 s10, v40, 6 +; SDAG-NEXT: v_readlane_b32 s9, v40, 5 +; SDAG-NEXT: v_readlane_b32 s8, v40, 4 +; SDAG-NEXT: v_readlane_b32 s7, v40, 3 +; SDAG-NEXT: v_readlane_b32 s6, v40, 2 +; SDAG-NEXT: v_readlane_b32 s5, v40, 1 +; SDAG-NEXT: v_readlane_b32 s4, v40, 0 +; SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; SDAG-NEXT: v_readlane_b32 s33, v40, 26 +; SDAG-NEXT: s_or_saveexec_b64 s[30:31], -1 +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: s_mov_b64 exec, s[30:31] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[36:37] +; +; GISEL-LABEL: gfx_func: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b64 exec, s[34:35] +; GISEL-NEXT: v_writelane_b32 v40, s33, 26 +; GISEL-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL-NEXT: v_writelane_b32 v40, s5, 1 +; GISEL-NEXT: v_writelane_b32 v40, s6, 2 +; GISEL-NEXT: v_writelane_b32 v40, s7, 3 +; GISEL-NEXT: v_writelane_b32 v40, s8, 4 +; GISEL-NEXT: v_writelane_b32 v40, s9, 5 +; GISEL-NEXT: v_writelane_b32 v40, s10, 6 +; GISEL-NEXT: v_writelane_b32 v40, s11, 7 +; GISEL-NEXT: v_writelane_b32 v40, s12, 8 +; GISEL-NEXT: v_writelane_b32 v40, s13, 9 +; GISEL-NEXT: v_writelane_b32 v40, s14, 10 +; GISEL-NEXT: v_writelane_b32 v40, s15, 11 +; GISEL-NEXT: v_writelane_b32 v40, s16, 12 +; GISEL-NEXT: v_writelane_b32 v40, s17, 13 +; GISEL-NEXT: v_writelane_b32 v40, s18, 14 +; GISEL-NEXT: v_writelane_b32 v40, s19, 15 +; GISEL-NEXT: v_writelane_b32 v40, s20, 16 +; GISEL-NEXT: v_writelane_b32 v40, s21, 17 +; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: v_writelane_b32 v40, s22, 18 +; GISEL-NEXT: v_writelane_b32 v40, s23, 19 +; GISEL-NEXT: s_mov_b64 s[36:37], s[30:31] +; GISEL-NEXT: s_getpc_b64 s[30:31] +; GISEL-NEXT: s_add_u32 s30, s30, extern_c_func@gotpcrel32@lo+4 +; GISEL-NEXT: s_addc_u32 s31, s31, extern_c_func@gotpcrel32@hi+12 +; GISEL-NEXT: v_writelane_b32 v40, s24, 20 +; GISEL-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GISEL-NEXT: v_writelane_b32 v40, s25, 21 +; GISEL-NEXT: v_writelane_b32 v40, s26, 22 +; GISEL-NEXT: v_writelane_b32 v40, s27, 23 +; GISEL-NEXT: v_writelane_b32 v40, s28, 24 +; GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; GISEL-NEXT: v_writelane_b32 v40, s29, 25 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GISEL-NEXT: v_readlane_b32 s29, v40, 25 +; GISEL-NEXT: v_readlane_b32 s28, v40, 24 +; GISEL-NEXT: v_readlane_b32 s27, v40, 23 +; GISEL-NEXT: v_readlane_b32 s26, v40, 22 +; GISEL-NEXT: v_readlane_b32 s25, v40, 21 +; GISEL-NEXT: v_readlane_b32 s24, v40, 20 +; GISEL-NEXT: v_readlane_b32 s23, v40, 19 +; GISEL-NEXT: v_readlane_b32 s22, v40, 18 +; GISEL-NEXT: v_readlane_b32 s21, v40, 17 +; GISEL-NEXT: v_readlane_b32 s20, v40, 16 +; GISEL-NEXT: v_readlane_b32 s19, v40, 15 +; GISEL-NEXT: v_readlane_b32 s18, v40, 14 +; GISEL-NEXT: v_readlane_b32 s17, v40, 13 +; GISEL-NEXT: v_readlane_b32 s16, v40, 12 +; GISEL-NEXT: v_readlane_b32 s15, v40, 11 +; GISEL-NEXT: v_readlane_b32 s14, v40, 10 +; GISEL-NEXT: v_readlane_b32 s13, v40, 9 +; GISEL-NEXT: v_readlane_b32 s12, v40, 8 +; GISEL-NEXT: v_readlane_b32 s11, v40, 7 +; GISEL-NEXT: v_readlane_b32 s10, v40, 6 +; GISEL-NEXT: v_readlane_b32 s9, v40, 5 +; GISEL-NEXT: v_readlane_b32 s8, v40, 4 +; GISEL-NEXT: v_readlane_b32 s7, v40, 3 +; GISEL-NEXT: v_readlane_b32 s6, v40, 2 +; GISEL-NEXT: v_readlane_b32 s5, v40, 1 +; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GISEL-NEXT: v_readlane_b32 s33, v40, 26 +; GISEL-NEXT: s_or_saveexec_b64 s[30:31], -1 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b64 exec, s[30:31] +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_setpc_b64 s[36:37] + call void @extern_c_func() + ret void +}