Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -9,10 +9,10 @@
 
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/ELF.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 using namespace llvm;
 
@@ -22,7 +22,8 @@
 
 MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
     const GlobalValue *GV, SectionKind Kind, const TargetMachine &TM) const {
-  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV))
+  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV) &&
+      AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
     return TextSection;
 
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, TM);
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
@@ -78,6 +78,19 @@
   bool isCFIntrinsic(const SDNode *Intr) const;
 
   void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
+
+  /// \returns True if fixup needs to be emitted for given global value \p GV,
+  /// false otherwise.
+  bool shouldEmitFixup(const GlobalValue *GV) const;
+
+  /// \returns True if GOT relocation needs to be emitted for given global value
+  /// \p GV, false otherwise.
+  bool shouldEmitGOTReloc(const GlobalValue *GV) const;
+
+  /// \returns True if PC-relative relocation needs to be emitted for given
+  /// global value \p GV, false otherwise.
+  bool shouldEmitPCReloc(const GlobalValue *GV) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
 
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1813,6 +1813,23 @@
   }
 }
 
+bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
+  const Triple &TT = getTargetMachine().getTargetTriple();
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+         AMDGPU::shouldEmitConstantsToTextSection(TT);
+}
+
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+         !shouldEmitFixup(GV) &&
+         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
+  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -1997,29 +2014,12 @@
   return DAG.getUNDEF(ASC->getValueType(0));
 }
 
-static bool shouldEmitFixup(const GlobalValue *GV,
-                            const TargetMachine &TM) {
-  // FIXME: We need to emit global variables in constant address space in a
-  // separate section, and use relocations.
-  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
-}
-
-static bool shouldEmitGOTReloc(const GlobalValue *GV,
-                               const TargetMachine &TM) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-         !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
-}
-
-static bool shouldEmitPCReloc(const GlobalValue *GV,
-                              const TargetMachine &TM) {
-  return !shouldEmitFixup(GV, TM) && !shouldEmitGOTReloc(GV, TM);
-}
-
 bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // We can fold offsets for anything that doesn't require a GOT relocation.
-  return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-         !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
+  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+         !shouldEmitGOTReloc(GA->getGlobal());
 }
 
 static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
@@ -2076,9 +2076,9 @@
   const GlobalValue *GV = GSD->getGlobal();
   EVT PtrVT = Op.getValueType();
 
-  if (shouldEmitFixup(GV, getTargetMachine()))
+  if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
-  else if (shouldEmitPCReloc(GV, getTargetMachine()))
+  else if (shouldEmitPCReloc(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
                                    SIInstrInfo::MO_REL32);
 
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -55,6 +55,10 @@
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
 
+/// \returns True if constants should be emitted to .text section for given
+/// target triple \p TT, false otherwise.
+bool shouldEmitConstantsToTextSection(const Triple &TT);
+
 /// \returns Integer value requested using \p F's \p Name attribute.
 ///
 /// \returns \p Default if attribute is not present.
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -164,6 +164,10 @@
   return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }
 
+bool shouldEmitConstantsToTextSection(const Triple &TT) {
+  return TT.getOS() != Triple::AMDHSA;
+}
+
 int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
   Attribute A = F.getFnAttribute(Name);
   int Result = Default;
Index: llvm/trunk/test/CodeGen/AMDGPU/global-constant.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global-constant.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/global-constant.ll
@@ -1,27 +1,54 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s
 
-@readonly = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
-@readonly2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
+@private1 = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
+@private2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
+@available_externally = available_externally addrspace(2) global [256 x i32] zeroinitializer
 
-; GCN-LABEL: {{^}}main:
+; GCN-LABEL: {{^}}private_test:
 ; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], readonly
-; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0
+
+; Non-HSA OSes use fixup into .text section.
+; NOHSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], private1
+; NOHSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0
+
+; HSA OSes use relocations.
+; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], private1@rel32@lo+4
+; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], private1@rel32@hi+4
+
 ; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
-; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly
-; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], 0
-; NOHSA: .text
-; HSA: .text
-; GCN: readonly:
-; GCN: readonly2:
-define void @main(i32 %index, float addrspace(1)* %out) {
-  %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly, i32 0, i32 %index
+
+; Non-HSA OSes use fixup into .text section.
+; NOHSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2
+; NOHSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], 0
+
+; HSA OSes use relocations.
+; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4
+; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4
+
+define void @private_test(i32 %index, float addrspace(1)* %out) {
+  %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
   %val = load float, float addrspace(2)* %ptr
   store float %val, float addrspace(1)* %out
-  %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly2, i32 0, i32 %index
+  %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
   %val2 = load float, float addrspace(2)* %ptr2
   store float %val2, float addrspace(1)* %out
   ret void
 }
 
+; HSA-LABEL: {{^}}available_externally_test:
+; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
+; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
+; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
+define void @available_externally_test(i32 addrspace(1)* %out) {
+  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
+  %val = load i32, i32 addrspace(2)* %ptr
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; NOHSA: .text
+; HSA: .section .rodata
+
+; GCN: private1:
+; GCN: private2:
Index: llvm/trunk/test/CodeGen/AMDGPU/hsa-globals.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hsa-globals.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -38,7 +38,7 @@
 ; ASM: .size external_global_program, 4
 
 ; ASM: .type internal_readonly,@object
-; ASM: .text
+; ASM: .section .rodata.cst4,"aM",@progbits,4
 ; ASM: internal_readonly:
 ; ASM: .long 0
 ; ASM: .size internal_readonly, 4
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.memcpy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -333,13 +333,13 @@
 
 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
 ; SI: s_getpc_b64
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4+4
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4+20
 ; SI: s_addc_u32
-; SI: s_load_dwordx4
-; SI: s_load_dwordx4
-; SI: s_load_dwordx2
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; SI-DAG: s_load_dwordx4
+; SI-DAG: s_load_dwordx4
+; SI-DAG: s_load_dwordx2
+; SI-DAG: buffer_store_dwordx4
+; SI-DAG: buffer_store_dwordx4
 define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
   %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false)