Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -958,10 +958,14 @@ } static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, - bool IsTailCall) { + bool IsTailCall, CallingConv::ID CC) { assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, " "because the address can be divergent"); - return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::G_SI_CALL; + if (!IsTailCall) + return AMDGPU::G_SI_CALL; + + return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX : + AMDGPU::SI_TCRETURN; } // Add operands to call instruction to track the callee. @@ -1186,7 +1190,7 @@ if (!IsSibCall) CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP); - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); if (!addCallTargetOperands(MIB, MIRBuilder, Info)) return false; @@ -1350,7 +1354,7 @@ // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.addDef(TRI->getReturnAddressReg(MF)); Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -367,6 +367,7 @@ // Function call. CALL, TC_RETURN, + TC_RETURN_GFX, TRAP, // Masked control flow nodes. Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4588,6 +4588,7 @@ NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) NODE_NAME_CASE(TC_RETURN) + NODE_NAME_CASE(TC_RETURN_GFX) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_GLUE) NODE_NAME_CASE(RETURN_TO_EPILOG) Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -90,6 +90,11 @@ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; +def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", + SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -134,7 +134,8 @@ OutMI.addOperand(Dest); OutMI.addOperand(Src); return; - } else if (Opcode == AMDGPU::SI_TCRETURN) { + } else if (Opcode == AMDGPU::SI_TCRETURN || + Opcode == AMDGPU::SI_TCRETURN_GFX) { // TODO: How to use branch immediate and avoid register+add? Opcode = AMDGPU::S_SETPC_B64; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3396,7 +3396,9 @@ // actual call instruction. if (IsTailCall) { MFI.setHasTailCall(); - return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops); + unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ? + AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN; + return DAG.getNode(OPC, DL, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -614,12 +614,34 @@ let isConvergent = 1; } +// Tail call handling pseudo for AMDGPU_gfx +def SI_TCRETURN_GFX : SPseudoInstSI <(outs), + (ins Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff), + [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { + let Size = 4; + let FixedSize = 1; + let isCall = 1; + let isTerminator = 1; + let isReturn = 1; + let isBarrier = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; + // TODO: Should really base this on the call target + let isConvergent = 1; +} + // Handle selecting indirect tail calls def : GCNPat< (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)), (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff) >; +// Handle selecting indirect tail calls for AMDGPU_gfx +def : GCNPat< + (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)), + (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff) +>; + def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), [(callseq_start timm:$amt0, timm:$amt1)], Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -817,12 +817,21 @@ let HasSGPR = 1; } -// CCR (call clobbered registers) SGPR 64-bit registers, currently used for tail call -// return address only. -def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 16))> { +// CCR (call clobbered registers) SGPR 64-bit registers +def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc SGPR_64, 16))> { let CopyCost = SGPR_64.CopyCost; let AllocationPriority = SGPR_64.AllocationPriority; - let HasSGPR = SGPR_64.HasSGPR; + let HasSGPR = 1; +} + +// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC +def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc (shl SGPR_64, 15), 1), // s[30:31] + (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; + let HasSGPR = 1; } def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, Index: llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6946826 /* regdef:SGPR_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7274506 /* regdef:SGPR_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6946825 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7274505 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6946826 /* regdef:SGPR_128 */, def %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7274506 /* regdef:SGPR_128 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6946825 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7274505 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -26,15 +26,15 @@ define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5898249 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -44,15 +44,15 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5832714 /* regdef:AReg_128 */, def %4 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5832713 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) Index: llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,7 +11,7 @@ ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def %26 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def %23 ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -35,7 +35,7 @@ ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5898250 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -58,7 +58,7 @@ ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def %25 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -80,7 +80,7 @@ ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -2,8 +2,6 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s ; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s -; FIXME: @caller uses s[4:5] to store the address of @callee. -; These registers are callee-save in the amdgpu_gfx calling convention, so they must not be clobbered, but they are clobbered here. ; Callee with VGPR arguments define hidden amdgpu_gfx float @callee(float %v.arg0, float %v.arg1) { @@ -20,22 +18,12 @@ ; GCN-LABEL: caller: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 -; GCN-NEXT: v_readlane_b32 s5, v2, 1 -; GCN-NEXT: v_readlane_b32 s4, v2, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[36:37] +; GCN-NEXT: s_add_u32 s36, s36, callee@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s37, s37, callee@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[36:37] %add = fadd float %arg0, 1.0 %call = tail call amdgpu_gfx float @callee(float %add, float 2.0) ret float %call