Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -69,8 +69,6 @@
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>,
    SDTCisVT<4, i1>]
 >;
-def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
-
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -95,7 +93,8 @@
    SDNPVariadic]
 >;
 
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
+  SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2711,6 +2711,11 @@
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
+  // Add a redundant copy of the callee global which will not be legalized, as
+  // we need direct access to the callee later.
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = GSD->getGlobal();
+  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
 
   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
@@ -3474,34 +3479,16 @@
       .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL:
-  case AMDGPU::SI_TCRETURN_ISEL: {
+  case AMDGPU::SI_CALL_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
 
-    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
-    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
-    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
-
-    const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
 
     MachineInstrBuilder MIB;
-    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
-    } else {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
-
-      // There is an additional imm operand for tcreturn, but it should be in the
-      // right place already.
-    }
+    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
 
-    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
 
     MIB.cloneMemRefs(MI);
@@ -4008,7 +3995,8 @@
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
-  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+  return (GV->getValueType()->isFunctionTy() ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -373,7 +373,8 @@
 // This version is only needed so we can fill in the output register in
 // the custom inserter.
 def SI_CALL_ISEL : SPseudoInstSI <
-  (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+  (outs), (ins SSrc_b64:$src0, unknown:$callee),
+  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
   let Size = 4;
   let isCall = 1;
   let SchedRW = [WriteBranch];
@@ -391,20 +392,9 @@
 }
 
 // Tail call handling pseudo
-def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
-  (ins SSrc_b64:$src0, i32imm:$fpdiff),
-  [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
-  let isCall = 1;
-  let isTerminator = 1;
-  let isReturn = 1;
-  let isBarrier = 1;
-  let SchedRW = [WriteBranch];
-  let usesCustomInserter = 1;
-}
-
-def SI_TCRETURN : SPseudoInstSI <
-  (outs),
-  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+def SI_TCRETURN : SPseudoInstSI <(outs),
+  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
   let Size = 4;
   let isCall = 1;
   let isTerminator = 1;
Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -14,7 +14,7 @@
 ; GCN-NOT: s32
 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}}
 ; GCN-NOT: s32
-define void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
+define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
 entry:
   %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
   %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
@@ -193,7 +193,7 @@
 ; GCN-NOT: s32
 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}}
 ; GCN-NOT: s32
-define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
+define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
 entry:
   %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
   %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -3,56 +3,56 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
 
-declare void @external_void_func_i1(i1) #0
-declare void @external_void_func_i1_signext(i1 signext) #0
-declare void @external_void_func_i1_zeroext(i1 zeroext) #0
-
-declare void @external_void_func_i8(i8) #0
-declare void @external_void_func_i8_signext(i8 signext) #0
-declare void @external_void_func_i8_zeroext(i8 zeroext) #0
-
-declare void @external_void_func_i16(i16) #0
-declare void @external_void_func_i16_signext(i16 signext) #0
-declare void @external_void_func_i16_zeroext(i16 zeroext) #0
-
-declare void @external_void_func_i32(i32) #0
-declare void @external_void_func_i64(i64) #0
-declare void @external_void_func_v2i64(<2 x i64>) #0
-declare void @external_void_func_v3i64(<3 x i64>) #0
-declare void @external_void_func_v4i64(<4 x i64>) #0
-
-declare void @external_void_func_f16(half) #0
-declare void @external_void_func_f32(float) #0
-declare void @external_void_func_f64(double) #0
-declare void @external_void_func_v2f32(<2 x float>) #0
-declare void @external_void_func_v2f64(<2 x double>) #0
-declare void @external_void_func_v3f64(<3 x double>) #0
-
-declare void @external_void_func_v2i16(<2 x i16>) #0
-declare void @external_void_func_v2f16(<2 x half>) #0
-declare void @external_void_func_v3i16(<3 x i16>) #0
-declare void @external_void_func_v3f16(<3 x half>) #0
-declare void @external_void_func_v4i16(<4 x i16>) #0
-declare void @external_void_func_v4f16(<4 x half>) #0
-
-declare void @external_void_func_v2i32(<2 x i32>) #0
-declare void @external_void_func_v3i32(<3 x i32>) #0
-declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
-declare void @external_void_func_v4i32(<4 x i32>) #0
-declare void @external_void_func_v8i32(<8 x i32>) #0
-declare void @external_void_func_v16i32(<16 x i32>) #0
-declare void @external_void_func_v32i32(<32 x i32>) #0
-declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
+declare hidden void @external_void_func_i1(i1) #0
+declare hidden void @external_void_func_i1_signext(i1 signext) #0
+declare hidden void @external_void_func_i1_zeroext(i1 zeroext) #0
+
+declare hidden void @external_void_func_i8(i8) #0
+declare hidden void @external_void_func_i8_signext(i8 signext) #0
+declare hidden void @external_void_func_i8_zeroext(i8 zeroext) #0
+
+declare hidden void @external_void_func_i16(i16) #0
+declare hidden void @external_void_func_i16_signext(i16 signext) #0
+declare hidden void @external_void_func_i16_zeroext(i16 zeroext) #0
+
+declare hidden void @external_void_func_i32(i32) #0
+declare hidden void @external_void_func_i64(i64) #0
+declare hidden void @external_void_func_v2i64(<2 x i64>) #0
+declare hidden void @external_void_func_v3i64(<3 x i64>) #0
+declare hidden void @external_void_func_v4i64(<4 x i64>) #0
+
+declare hidden void @external_void_func_f16(half) #0
+declare hidden void @external_void_func_f32(float) #0
+declare hidden void @external_void_func_f64(double) #0
+declare hidden void @external_void_func_v2f32(<2 x float>) #0
+declare hidden void @external_void_func_v2f64(<2 x double>) #0
+declare hidden void @external_void_func_v3f64(<3 x double>) #0
+
+declare hidden void @external_void_func_v2i16(<2 x i16>) #0
+declare hidden void @external_void_func_v2f16(<2 x half>) #0
+declare hidden void @external_void_func_v3i16(<3 x i16>) #0
+declare hidden void @external_void_func_v3f16(<3 x half>) #0
+declare hidden void @external_void_func_v4i16(<4 x i16>) #0
+declare hidden void @external_void_func_v4f16(<4 x half>) #0
+
+declare hidden void @external_void_func_v2i32(<2 x i32>) #0
+declare hidden void @external_void_func_v3i32(<3 x i32>) #0
+declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
+declare hidden void @external_void_func_v4i32(<4 x i32>) #0
+declare hidden void @external_void_func_v8i32(<8 x i32>) #0
+declare hidden void @external_void_func_v16i32(<16 x i32>) #0
+declare hidden void @external_void_func_v32i32(<32 x i32>) #0
+declare hidden void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
 
 ; return value and argument
-declare i32 @external_i32_func_i32(i32) #0
+declare hidden i32 @external_i32_func_i32(i32) #0
 
 ; Structs
-declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
-declare void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0
-declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0
+declare hidden void @external_void_func_struct_i8_i32({ i8, i32 }) #0
+declare hidden void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0
+declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0
 
-declare void @external_void_func_v16i8(<16 x i8>) #0
+declare hidden void @external_void_func_v16i8(<16 x i8>) #0
 
 
 ; FIXME: Should be passing -1
@@ -764,8 +764,8 @@
   ret void
 }
 
-declare void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
-declare void @stack_passed_f64_arg(<32 x i32>, double) #0
+declare hidden void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
+declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/call-constexpr.ll
===================================================================
--- test/CodeGen/AMDGPU/call-constexpr.ll
+++ test/CodeGen/AMDGPU/call-constexpr.ll
@@ -67,7 +67,7 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT: s_setpc_b64
-define i32 @use_workitem_id_x(i32 %arg0) #0 {
+define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %op = add i32 %id, %arg0
   ret i32 %op
@@ -121,15 +121,15 @@
 
 ; Callees appears last in source file to test that we still lower their
 ; arguments before we lower any calls to them.
-define i32 @ret_i32_noinline() #0 {
+define hidden i32 @ret_i32_noinline() #0 {
   ret i32 4
 }
 
-define i32 @ret_i32_alwaysinline() #1 {
+define hidden i32 @ret_i32_alwaysinline() #1 {
   ret i32 4
 }
 
-define i32 @ident_i32(i32 %i) #0 {
+define hidden i32 @ident_i32(i32 %i) #0 {
   ret i32 %i
 }
 
Index: test/CodeGen/AMDGPU/call-preserved-registers.ll
===================================================================
--- test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare void @external_void_func_void() #0
+declare hidden void @external_void_func_void() #0
 
 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GCN: s_mov_b32 s33, s7
@@ -84,7 +84,7 @@
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_setpc_b64 s[30:31]
-define void @void_func_void_clobber_vcc() #2 {
+define hidden void @void_func_void_clobber_vcc() #2 {
   call void asm sideeffect "", "~{VCC}"() #0
   ret void
 }
@@ -176,7 +176,7 @@
 ; GCN-NEXT: #ASMEND
 ; GCN-NEXT: v_readlane_b32 s33, v0, 0
 ; GCN-NEXT: s_setpc_b64
-define void @void_func_void_clobber_s33() #2 {
+define hidden void @void_func_void_clobber_s33() #2 {
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
 }
Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -166,11 +166,11 @@
 }
 
-declare <2 x float> @func_v2f32() #0
-declare <3 x float> @func_v3f32() #0
-declare <4 x float> @func_v4f32() #0
-declare <4 x half> @func_v4f16() #0
+declare hidden <2 x float> @func_v2f32() #0
+declare hidden <3 x float> @func_v3f32() #0
+declare hidden <4 x float> @func_v4f32() #0
+declare hidden <4 x half> @func_v4f16() #0
 
-declare { <4 x i32>, <4 x half> } @func_struct() #0
+declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
 
 attributes #0 = { nounwind}
 
Index: test/CodeGen/AMDGPU/function-call-relocs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/function-call-relocs.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
+
+define void @func(i32 addrspace(1)* %out) #0 {
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+define protected void @protected_func(i32 addrspace(1)* %out) #0 {
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+define hidden void @hidden_func(i32 addrspace(1)* %out) #0 {
+  store i32 1, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: call_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], func@gotpcrel32@lo+4
+; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], func@gotpcrel32@hi+4
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) {
+  call void @func(i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK-LABEL: call_protected_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], protected_func@rel32@lo+4
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], protected_func@rel32@hi+4
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) {
+  call void @protected_func(i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK-LABEL: call_hidden_func:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], hidden_func@rel32@lo+4
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], hidden_func@rel32@hi+4
+; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) {
+  call void @hidden_func(i32 addrspace(1)* %out)
+  ret void
+}
+
+attributes #0 = { noinline }
Index: test/CodeGen/AMDGPU/ipra.ll
===================================================================
--- test/CodeGen/AMDGPU/ipra.ll
+++ test/CodeGen/AMDGPU/ipra.ll
@@ -12,7 +12,7 @@
 
 ; GCN-LABEL: {{^}}func:
 ; GCN: ; NumVgprs: 8
-define void @func() #1 {
+define hidden void @func() #1 {
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
   ret void
 }
Index: test/CodeGen/AMDGPU/mem-builtins.ll
===================================================================
--- test/CodeGen/AMDGPU/mem-builtins.ll
+++ test/CodeGen/AMDGPU/mem-builtins.ll
@@ -1,12 +1,12 @@
 ; RUN: not llc -march=r600 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
 
-declare i32 @memcmp(i8 addrspace(1)* readonly nocapture, i8 addrspace(1)* readonly nocapture, i64) #0
-declare i8 addrspace(1)* @memchr(i8 addrspace(1)* readonly nocapture, i32, i64) #1
-declare i8* @strcpy(i8* nocapture, i8* readonly nocapture) #0
-declare i32 @strlen(i8* nocapture) #1
-declare i32 @strnlen(i8* nocapture, i32) #1
-declare i32 @strcmp(i8* nocapture, i8* nocapture) #1
+declare hidden i32 @memcmp(i8 addrspace(1)* readonly nocapture, i8 addrspace(1)* readonly nocapture, i64) #0
+declare hidden i8 addrspace(1)* @memchr(i8 addrspace(1)* readonly nocapture, i32, i64) #1
+declare hidden i8* @strcpy(i8* nocapture, i8* readonly nocapture) #0
+declare hidden i32 @strlen(i8* nocapture) #1
+declare hidden i32 @strnlen(i8* nocapture, i32) #1
+declare hidden i32 @strcmp(i8* nocapture, i8* nocapture) #1
 
 
 ; ERROR: error: <unknown>:0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- test/CodeGen/AMDGPU/sibling-call.ll
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -32,7 +32,7 @@
 }
 
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
-define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret i32 %ret
@@ -92,7 +92,7 @@
 
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
-define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 {
+define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval align 4 %arg1) #1 {
   %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
   %add0 = add i32 %arg0, %arg1.load
   ret i32 %add0