Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h @@ -105,6 +105,8 @@ bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1422,6 +1422,14 @@ return DAG.getUNDEF(ASC->getValueType(0)); } +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + if (GA->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) + return false; + + return TargetLowering::isOffsetFoldingLegal(GA); +} + SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { Index: llvm/trunk/test/CodeGen/AMDGPU/gv-offset-folding.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/gv-offset-folding.ll +++ llvm/trunk/test/CodeGen/AMDGPU/gv-offset-folding.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -relocation-model=static < %s | FileCheck %s + +@lds = external addrspace(3) global [4 x i32] + +; Function Attrs: nounwind + +; Offset folding is an optimization done for global variables with relocations, +; which allows you to store the offset in the r_addend of the relocation entry. +; The offset is applied to the variable's address at link time, which eliminates +; the need to emit shader instructions to do this calculation. +; We don't use relocations for local memory, so we should never fold offsets +; for local memory globals. 
+ +; CHECK-LABEL: lds_no_offset: +; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4 +define void @lds_no_offset() { +entry: + %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1 + store i32 0, i32 addrspace(3)* %ptr + ret void +}