Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -121,7 +121,7 @@ bool selectG_STORE(MachineInstr &I) const; bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; - bool selectG_FRAME_INDEX(MachineInstr &I) const; + bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const; bool selectG_PTR_MASK(MachineInstr &I) const; bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1796,7 +1796,8 @@ return true; } -bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( + MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; @@ -2126,7 +2127,8 @@ case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: - return selectG_FRAME_INDEX(I); + case TargetOpcode::G_GLOBAL_VALUE: + return selectG_FRAME_INDEX_GLOBAL_VALUE(I); case TargetOpcode::G_PTR_MASK: return selectG_PTR_MASK(I); case TargetOpcode::G_EXTRACT_VECTOR_ELT: Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1688,6 +1688,12 @@ // TODO: We could emit code to handle the initialization somewhere. 
 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->shouldUseLDSConstAddress(GV)) { + MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); + return true; // Leave in place; + } + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); MI.eraseFromParent(); return true; Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -199,6 +199,10 @@ /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; + /// \returns true if this should use a literal constant for an LDS address, + /// and not emit a relocation for an LDS global. + bool shouldUseLDSConstAddress(const GlobalValue *GV) const; + private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4411,6 +4411,14 @@ return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); } +bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { + if (!GV->hasExternalLinkage()) + return true; + + const auto OS = getTargetMachine().getTargetTriple().getOS(); + return OS == Triple::AMDHSA || OS == Triple::AMDPAL; +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -5046,9 +5054,7 @@ GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - (!GV->hasExternalLinkage() || 
getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || + shouldUseLDSConstAddress(GV)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -0,0 +1,28 @@ +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; FIXME: Merge with DAG test + +@lds.external = external unnamed_addr addrspace(3) global [0 x i32] +@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 + +; GCN-LABEL: {{^}}test_basic: +; GCN: s_add_u32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x80,A,A,A,A] +; GCN: v_add_u32_e32 v0, lds.external@abs32@lo, v0 ; encoding: [0xff,0x00,0x00,0x68,A,A,A,A] + +; GCN: .globl lds.external +; GCN: .amdgpu_lds lds.external, 0, 4 +; GCN: .globl lds.defined +; GCN: .amdgpu_lds lds.defined, 32, 8 +define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 { +main_body: + %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 + %tmp = load i32, i32 addrspace(3)* %gep0 + + %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave + store i32 123, i32 addrspace(3)* %gep1 + + %r = bitcast i32 %tmp to float + ret float %r +} + +attributes #0 = { "no-signed-zeros-fp-math"="true" } +attributes #4 = { convergent nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ 
 -1,5 +1,3 @@ ; RUN: not llc -global-isel -march=amdgcn -mcpu=tonga < %S/../lds-zero-initializer.ll 2>&1 | FileCheck %s -; FIXME: Select should succeed ; CHECK: error: <unknown>:0:0: in function load_zeroinit_lds_global void (i32 addrspace(1)*, i1): unsupported initializer for address space -; CHECK: LLVM ERROR: cannot select: %16:sreg_32(p3) = G_GLOBAL_VALUE @lds (in function: load_zeroinit_lds_global) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -1165,7 +1165,7 @@ ret void } -@lds0 = addrspace(3) global [512 x i32] undef +@lds0 = internal addrspace(3) global [512 x i32] undef define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0: @@ -1750,7 +1750,7 @@ ret void } -@lds1 = addrspace(3) global [512 x i64] undef, align 8 +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -516,7 +516,7 @@ ret void } -@lds0 = addrspace(3) global [512 x i32] undef, align 4 +@lds0 = internal addrspace(3) global [512 x i32] undef, align 4 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: @@ -1435,7 +1435,7 @@ ret void } -@lds1 = addrspace(3) global [512 x i64] undef, align 8 +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: Index: llvm/test/CodeGen/AMDGPU/lds-relocs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -47,10 +47,8 @@ %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 %tmp = load i32, i32 addrspace(3)* %gep0 - %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0) - %mask.32 = trunc i64 %mask to i32 %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave - store i32 %mask.32, i32 addrspace(3)* %gep1 + store i32 123, i32 addrspace(3)* %gep1 %r = bitcast i32 %tmp to float ret float %r