Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -69,8 +69,6 @@
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>,
    SDTCisVT<4, i1>]
 >;
 
-def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
-
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -95,7 +93,8 @@
    SDNPVariadic]
 >;
 
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
+                            SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2711,6 +2711,11 @@
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
+  // Add a redundant copy of the callee global which will not be legalized, as
+  // we need direct access to the callee later.
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = GSD->getGlobal();
+  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
 
   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
@@ -3474,34 +3479,16 @@
       .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL:
-  case AMDGPU::SI_TCRETURN_ISEL: {
+  case AMDGPU::SI_CALL_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
-    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
-    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
-    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
-
-    const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
 
     MachineInstrBuilder MIB;
-    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
-    } else {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
-
-      // There is an additional imm operand for tcreturn, but it should be in the
-      // right place already.
-    }
+    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
 
-    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
 
     MIB.cloneMemRefs(MI);
@@ -4008,7 +3995,10 @@
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
-  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+  // FIXME: Either avoid relying on address space here or change the default
+  // address space for functions to avoid the explicit check.
+  return (GV->getValueType()->isFunctionTy() ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -373,7 +373,8 @@
 // This version is only needed so we can fill in the output register in
 // the custom inserter.
 def SI_CALL_ISEL : SPseudoInstSI <
-  (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+  (outs), (ins SSrc_b64:$src0, unknown:$callee),
+  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
   let Size = 4;
   let isCall = 1;
   let SchedRW = [WriteBranch];
@@ -391,20 +392,9 @@
 }
 
 // Tail call handling pseudo
-def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
-  (ins SSrc_b64:$src0, i32imm:$fpdiff),
-  [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
-  let isCall = 1;
-  let isTerminator = 1;
-  let isReturn = 1;
-  let isBarrier = 1;
-  let SchedRW = [WriteBranch];
-  let usesCustomInserter = 1;
-}
-
-def SI_TCRETURN : SPseudoInstSI <
-  (outs),
-  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+def SI_TCRETURN : SPseudoInstSI <(outs),
+  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
   let Size = 4;
   let isCall = 1;
   let isTerminator = 1;
Index: llvm/trunk/test/CodeGen/AMDGPU/function-call-relocs.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-call-relocs.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/function-call-relocs.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
+
+declare void @func(i32 addrspace(1)* %out)
+
+declare protected void @protected_func(i32 addrspace(1)* %out)
+
+declare hidden void @hidden_func(i32 addrspace(1)* %out)
+
+; CHECK-LABEL: call_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], func@gotpcrel32@lo+4
+; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], func@gotpcrel32@hi+4
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) {
+  call void @func(i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK-LABEL: call_protected_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], protected_func@rel32@lo+4
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], protected_func@rel32@hi+4
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) {
+  call void @protected_func(i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK-LABEL: call_hidden_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], hidden_func@rel32@lo+4
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], hidden_func@rel32@hi+4
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) {
+  call void @hidden_func(i32 addrspace(1)* %out)
+  ret void
+}
+
+declare i64 @funci()
+
+; CHECK-LABEL: tail_call_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], funci@gotpcrel32@lo+4
+; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], funci@gotpcrel32@hi+4
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0
+; CHECK: s_setpc_b64 s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define i64 @tail_call_func() {
+  %ret = tail call i64 @funci()
+  ret i64 %ret
+}
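
A note on the technique in this patch (commentary, not part of the diff): a TargetGlobalAddress node is opaque to the legalizer. LowerCall therefore attaches a second, redundant copy of the callee to the call node, so even after the first Callee operand is legalized into a GOT load (the @func and @funci cases in the test), the GlobalValue remains directly reachable from the node. That is what lets the tglobaladdr:$callee operand of the SI_CALL_ISEL and SI_TCRETURN patterns match during instruction selection, and why SI_TCRETURN_ISEL and its custom inserter, which recovered the global by walking back to the SI_PC_ADD_REL_OFFSET def, can be deleted. A minimal sketch of the operand setup, mirroring the SIISelLowering.cpp hunk above and assuming the usual LowerCall context (DAG, Chain, Callee, and DL in scope):

  // Selectable callee operand: legalization may turn this into a GOT load,
  // after which the GlobalValue is no longer visible on the node.
  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Redundant copy of the callee: target nodes are never legalized, so the
  // global survives to selection and matches tglobaladdr:$callee. The cast
  // assumes a direct call, as the patch itself does.
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
  Ops.push_back(DAG.getTargetGlobalAddress(GSD->getGlobal(), DL, MVT::i64));

A knock-on effect shows up in the SI_CALL_ISEL custom inserter: the pseudo's operands are now already in SI_CALL's final order, so the operand copy loop starts at 0 instead of special-casing operand 0 and adding the global address by hand.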