Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3478,14 +3478,20 @@ case AMDGPU::SI_TCRETURN_ISEL: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned GlobalAddrReg = MI.getOperand(0).getReg(); - MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); - assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); + unsigned AddrReg = MI.getOperand(0).getReg(); + MachineInstr *DefI = MRI.getVRegDef(AddrReg); + + if (DefI->getOpcode() == AMDGPU::S_LOAD_DWORDX2_IMM) { + AddrReg = DefI->getOperand(1).getReg(); + DefI = MRI.getVRegDef(AddrReg); + } + assert(DefI->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); - const GlobalValue *G = PCRel->getOperand(1).getGlobal(); + const GlobalValue *G = DefI->getOperand(1).getGlobal(); MachineInstrBuilder MIB; if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { @@ -4008,7 +4014,8 @@ } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + return (GV->getValueType()->isFunctionTy() || + GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -14,7 +14,7 @@ ; GCN-NOT: s32 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}} ; GCN-NOT: s32 -define void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { +define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 @@ -193,7 +193,7 @@ ; GCN-NOT: s32 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}} ; GCN-NOT: s32 -define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { +define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { entry: %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8 Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -3,56 +3,56 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s -declare void @external_void_func_i1(i1) #0 -declare void @external_void_func_i1_signext(i1 signext) #0 -declare void @external_void_func_i1_zeroext(i1 zeroext) #0 - -declare void @external_void_func_i8(i8) #0 -declare void @external_void_func_i8_signext(i8 signext) #0 -declare void @external_void_func_i8_zeroext(i8 zeroext) #0 - -declare void @external_void_func_i16(i16) #0 -declare void @external_void_func_i16_signext(i16 signext) #0 -declare void @external_void_func_i16_zeroext(i16 zeroext) #0 - -declare void @external_void_func_i32(i32) #0 -declare void @external_void_func_i64(i64) #0 -declare void @external_void_func_v2i64(<2 x i64>) #0 -declare void @external_void_func_v3i64(<3 x i64>) #0 -declare void @external_void_func_v4i64(<4 x i64>) #0 - -declare void @external_void_func_f16(half) #0 -declare void @external_void_func_f32(float) #0 -declare void @external_void_func_f64(double) #0 -declare void @external_void_func_v2f32(<2 x float>) #0 -declare void @external_void_func_v2f64(<2 x double>) #0 -declare void @external_void_func_v3f64(<3 x double>) #0 - -declare void @external_void_func_v2i16(<2 x i16>) #0 -declare void @external_void_func_v2f16(<2 x half>) #0 -declare void @external_void_func_v3i16(<3 x i16>) #0 -declare void @external_void_func_v3f16(<3 x half>) #0 -declare void @external_void_func_v4i16(<4 x i16>) #0 -declare void @external_void_func_v4f16(<4 x half>) #0 - -declare void @external_void_func_v2i32(<2 x i32>) #0 -declare void @external_void_func_v3i32(<3 x i32>) #0 -declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0 -declare void @external_void_func_v4i32(<4 x i32>) #0 -declare void @external_void_func_v8i32(<8 x i32>) #0 -declare void @external_void_func_v16i32(<16 x i32>) #0 -declare void @external_void_func_v32i32(<32 x i32>) #0 -declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 +declare hidden void @external_void_func_i1(i1) #0 +declare hidden void @external_void_func_i1_signext(i1 signext) #0 +declare hidden void @external_void_func_i1_zeroext(i1 zeroext) #0 + +declare hidden void @external_void_func_i8(i8) #0 +declare hidden void @external_void_func_i8_signext(i8 signext) #0 +declare hidden void @external_void_func_i8_zeroext(i8 zeroext) #0 + +declare hidden void @external_void_func_i16(i16) #0 +declare hidden void @external_void_func_i16_signext(i16 signext) #0 +declare hidden void @external_void_func_i16_zeroext(i16 zeroext) #0 + +declare hidden void @external_void_func_i32(i32) #0 +declare hidden void @external_void_func_i64(i64) #0 +declare hidden void @external_void_func_v2i64(<2 x i64>) #0 +declare hidden void @external_void_func_v3i64(<3 x i64>) #0 +declare hidden void @external_void_func_v4i64(<4 x i64>) #0 + +declare hidden void @external_void_func_f16(half) #0 +declare hidden void @external_void_func_f32(float) #0 +declare hidden void @external_void_func_f64(double) #0 +declare hidden void @external_void_func_v2f32(<2 x float>) #0 +declare hidden void @external_void_func_v2f64(<2 x double>) #0 +declare hidden void @external_void_func_v3f64(<3 x double>) #0 + +declare hidden void @external_void_func_v2i16(<2 x i16>) #0 +declare hidden void @external_void_func_v2f16(<2 x half>) #0 +declare hidden void @external_void_func_v3i16(<3 x i16>) #0 +declare hidden void @external_void_func_v3f16(<3 x half>) #0 +declare hidden void @external_void_func_v4i16(<4 x i16>) #0 +declare hidden void @external_void_func_v4f16(<4 x half>) #0 + +declare hidden void @external_void_func_v2i32(<2 x i32>) #0 +declare hidden void @external_void_func_v3i32(<3 x i32>) #0 +declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0 +declare hidden void @external_void_func_v4i32(<4 x i32>) #0 +declare hidden void @external_void_func_v8i32(<8 x i32>) #0 +declare hidden void @external_void_func_v16i32(<16 x i32>) #0 +declare hidden void @external_void_func_v32i32(<32 x i32>) #0 +declare hidden void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 ; return value and argument -declare i32 @external_i32_func_i32(i32) #0 +declare hidden i32 @external_i32_func_i32(i32) #0 ; Structs -declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0 -declare void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0 -declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 +declare hidden void @external_void_func_struct_i8_i32({ i8, i32 }) #0 +declare hidden void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0 +declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 -declare void @external_void_func_v16i8(<16 x i8>) #0 +declare hidden void @external_void_func_v16i8(<16 x i8>) #0 ; FIXME: Should be passing -1 @@ -764,8 +764,8 @@ ret void } -declare void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0 -declare void @stack_passed_f64_arg(<32 x i32>, double) #0 +declare hidden void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0 +declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/call-constexpr.ll =================================================================== --- test/CodeGen/AMDGPU/call-constexpr.ll +++ test/CodeGen/AMDGPU/call-constexpr.ll @@ -67,7 +67,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: s_setpc_b64 -define i32 @use_workitem_id_x(i32 %arg0) #0 { +define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op @@ -121,15 +121,15 @@ ; Callees appears last in source file to test that we still lower their ; arguments before we lower any calls to them. -define i32 @ret_i32_noinline() #0 { +define hidden i32 @ret_i32_noinline() #0 { ret i32 4 } -define i32 @ret_i32_alwaysinline() #1 { +define hidden i32 @ret_i32_alwaysinline() #1 { ret i32 4 } -define i32 @ident_i32(i32 %i) #0 { +define hidden i32 @ident_i32(i32 %i) #0 { ret i32 %i } Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare void @external_void_func_void() #0 +declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 s33, s7 @@ -84,7 +84,7 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_setpc_b64 s[30:31] -define void @void_func_void_clobber_vcc() #2 { +define hidden void @void_func_void_clobber_vcc() #2 { call void asm sideeffect "", "~{VCC}"() #0 ret void } @@ -176,7 +176,7 @@ ; GCN-NEXT: #ASMEND ; GCN-NEXT: v_readlane_b32 s33, v0, 0 ; GCN-NEXT: s_setpc_b64 -define void @void_func_void_clobber_s33() #2 { +define hidden void @void_func_void_clobber_s33() #2 { call void asm sideeffect "; clobber", "~{s33}"() #0 ret void } Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -166,11 +166,11 @@ } -declare <2 x float> @func_v2f32() #0 -declare <3 x float> @func_v3f32() #0 -declare <4 x float> @func_v4f32() #0 -declare <4 x half> @func_v4f16() #0 +declare hidden <2 x float> @func_v2f32() #0 +declare hidden <3 x float> @func_v3f32() #0 +declare hidden <4 x float> @func_v4f32() #0 +declare hidden <4 x half> @func_v4f16() #0 -declare { <4 x i32>, <4 x half> } @func_struct() #0 +declare hidden { <4 x i32>, <4 x half> } @func_struct() #0 attributes #0 = { nounwind} Index: test/CodeGen/AMDGPU/function-call-relocs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/function-call-relocs.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s + +define void @func(i32 addrspace(1)* %out) #0 { + store i32 1, i32 addrspace(1)* %out + ret void +} + +define protected void @protected_func(i32 addrspace(1)* %out) #0 { + store i32 1, i32 addrspace(1)* %out + ret void +} + +define hidden void @hidden_func(i32 addrspace(1)* %out) #0 { + store i32 1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: call_func: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], func@gotpcrel32@lo+4 +; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], func@gotpcrel32@hi+4 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0 +; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) { + call void @func(i32 addrspace(1)* %out) + ret void +} + +; CHECK-LABEL: call_protected_func: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], protected_func@rel32@lo+4 +; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], protected_func@rel32@hi+4 +; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) { + call void @protected_func(i32 addrspace(1)* %out) + ret void +} + +; CHECK-LABEL: call_hidden_func: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], hidden_func@rel32@lo+4 +; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], hidden_func@rel32@hi+4 +; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) { + call void @hidden_func(i32 addrspace(1)* %out) + ret void +} + +attributes #0 = { noinline } Index: test/CodeGen/AMDGPU/ipra.ll =================================================================== --- test/CodeGen/AMDGPU/ipra.ll +++ test/CodeGen/AMDGPU/ipra.ll @@ -12,7 +12,7 @@ ; GCN-LABEL: {{^}}func: ; GCN: ; NumVgprs: 8 -define void @func() #1 { +define hidden void @func() #1 { call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 ret void } Index: test/CodeGen/AMDGPU/mem-builtins.ll =================================================================== --- test/CodeGen/AMDGPU/mem-builtins.ll +++ test/CodeGen/AMDGPU/mem-builtins.ll @@ -1,12 +1,12 @@ ; RUN: not llc -march=r600 < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s -declare i32 @memcmp(i8 addrspace(1)* readonly nocapture, i8 addrspace(1)* readonly nocapture, i64) #0 -declare i8 addrspace(1)* @memchr(i8 addrspace(1)* readonly nocapture, i32, i64) #1 -declare i8* @strcpy(i8* nocapture, i8* readonly nocapture) #0 -declare i32 @strlen(i8* nocapture) #1 -declare i32 @strnlen(i8* nocapture, i32) #1 -declare i32 @strcmp(i8* nocapture, i8* nocapture) #1 +declare hidden i32 @memcmp(i8 addrspace(1)* readonly nocapture, i8 addrspace(1)* readonly nocapture, i64) #0 +declare hidden i8 addrspace(1)* @memchr(i8 addrspace(1)* readonly nocapture, i32, i64) #1 +declare hidden i8* @strcpy(i8* nocapture, i8* readonly nocapture) #0 +declare hidden i32 @strlen(i8* nocapture) #1 +declare hidden i32 @strnlen(i8* nocapture, i32) #1 +declare hidden i32 @strcmp(i8* nocapture, i8* nocapture) #1 ; ERROR: error: :0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -32,7 +32,7 @@ } ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32: -define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret @@ -92,7 +92,7 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] -define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 { +define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 { %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4 %add0 = add i32 %arg0, %arg1.load ret i32 %add0