Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -16,6 +16,7 @@
 
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "AMDGPUArgumentUsageInfo.h"
+#include "SIInstrInfo.h"
 
 namespace llvm {
 
@@ -58,6 +59,10 @@
   bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
                       MachineIRBuilder &B) const;
 
+  bool buildPCRelGlobalAddress(
+    Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+    unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+
   bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B) const;
   bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -309,7 +309,8 @@
     .legalIf(isPointer(0));
 
   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
-  getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});
+  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
 
   auto &FPOpActions = getActionDefinitionsBuilder(
@@ -1509,6 +1510,63 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
+  Register DstReg, LLT PtrTy,
+  MachineIRBuilder &B, const GlobalValue *GV,
+  unsigned Offset, unsigned GAFlags) const {
+  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is
+  // lowered to the following code sequence:
+  //
+  // For constant address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol
+  //   s_addc_u32 s1, s1, 0
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   a fixup or relocation is emitted to replace $symbol with a literal
+  //   constant, which is a pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // For global address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   fixups or relocations are emitted to replace $symbol@*@lo and
+  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // What we want here is an offset from the value returned by s_getpc
+  // (which is the address of the s_add_u32 instruction) to the global
+  // variable, but since the encoding of $symbol starts 4 bytes after the start
+  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+  // small. This requires us to add 4 to the global variable offset in order to
+  // compute the correct address.
+
+  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+
+  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
+    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
+
+  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
+    .addDef(PCReg);
+
+  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
+  if (GAFlags == SIInstrInfo::MO_NONE)
+    MIB.addImm(0);
+  else
+    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
+
+  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
+
+  if (PtrTy.getSizeInBits() == 32)
+    B.buildExtract(DstReg, PCReg, 0);
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeGlobalValue(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {
@@ -1519,10 +1577,9 @@
   const GlobalValue *GV = MI.getOperand(1).getGlobal();
   MachineFunction &MF = B.getMF();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  B.setInstr(MI);
 
   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    B.setInstr(MI);
-
     if (!MFI->isEntryFunction()) {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
@@ -1536,13 +1593,47 @@
       MI.eraseFromParent();
       return true;
     }
+
+    const Function &Fn = MF.getFunction();
+    DiagnosticInfoUnsupported BadInit(
+      Fn, "unsupported initializer for address space", MI.getDebugLoc());
+    Fn.getContext().diagnose(BadInit);
+    return true;
+  }
+
+  const SITargetLowering *TLI = ST.getTargetLowering();
+
+  if (TLI->shouldEmitFixup(GV)) {
+    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (TLI->shouldEmitPCReloc(GV)) {
+    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+
+  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
+    MachinePointerInfo::getGOT(MF),
+    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+    MachineMemOperand::MOInvariant,
+    8 /*Size*/, 8 /*Align*/);
+
+  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
+
+  if (Ty.getSizeInBits() == 32) {
+    // Truncate if this is a 32-bit constant address.
+    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
+    B.buildExtract(DstReg, Load, 0);
   } else
-    return false;
+    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
 
-  const Function &Fn = MF.getFunction();
-  DiagnosticInfoUnsupported BadInit(
-    Fn, "unsupported initializer for address space", MI.getDebugLoc());
-  Fn.getContext().diagnose(BadInit);
+  MI.eraseFromParent();
   return true;
 }
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -186,6 +186,7 @@
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;
 
+public:
   /// \returns True if fixup needs to be emitted for given global value \p GV,
   /// false otherwise.
   bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -198,6 +199,7 @@
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;
 
+private:
  // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
  // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
  // pointed to by Offsets.
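Note (illustration only, not part of the diff): after instruction selection, the GOT path added above is expected to end up as roughly the sequence below, assuming the GOT entry is fetched with a scalar load (the exact SMEM opcode and register assignment depend on selection and allocation; "sym" is a placeholder symbol name):

    s_getpc_b64    s[0:1]
    s_add_u32      s0, s0, sym@gotpcrel32@lo+4
    s_addc_u32     s1, s1, sym@gotpcrel32@hi+4
    s_load_dwordx2 s[0:1], s[0:1], 0x0

The Offset + 4 addend in buildPCRelGlobalAddress exists because s_getpc_b64 returns the address of the following s_add_u32, while the relocations are resolved against the literal operand encoded 4 bytes into that instruction; without the extra 4 the computed address would fall 4 bytes short of the symbol, as the comment in the function explains.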
Index: test/CodeGen/AMDGPU/GlobalISel/global-value.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/global-value.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=HSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -stop-after=legalizer < %s | FileCheck -check-prefix=PAL %s
+
+@external_constant = external addrspace(4) constant i32, align 4
+@external_constant32 = external addrspace(6) constant i32, align 4
+@external_global = external addrspace(1) global i32, align 4
+
+@internal_constant = internal addrspace(4) constant i32 9, align 4
+@internal_constant32 = internal addrspace(6) constant i32 9, align 4
+@internal_global = internal addrspace(1) global i32 9, align 4
+
+
+define i32 addrspace(4)* @external_constant_got() {
+  ; HSA-LABEL: name: external_constant_got
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant + 4, target-flags(amdgpu-gotprel32-hi) @external_constant + 4, implicit-def $scc
+  ; HSA: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load 8 from got, addrspace 4)
+  ; HSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p4)
+  ; HSA: $vgpr0 = COPY [[UV]](s32)
+  ; HSA: $vgpr1 = COPY [[UV1]](s32)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ; PAL-LABEL: name: external_constant_got
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET @external_constant + 4, 0, implicit-def $scc
+  ; PAL: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p4)
+  ; PAL: $vgpr0 = COPY [[UV]](s32)
+  ; PAL: $vgpr1 = COPY [[UV1]](s32)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ret i32 addrspace(4)* @external_constant
+}
+
+define i32 addrspace(1)* @external_global_got() {
+  ; HSA-LABEL: name: external_global_got
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_global + 4, target-flags(amdgpu-gotprel32-hi) @external_global + 4, implicit-def $scc
+  ; HSA: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load 8 from got, addrspace 4)
+  ; HSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1)
+  ; HSA: $vgpr0 = COPY [[UV]](s32)
+  ; HSA: $vgpr1 = COPY [[UV1]](s32)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ; PAL-LABEL: name: external_global_got
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_global + 4, target-flags(amdgpu-gotprel32-hi) @external_global + 4, implicit-def $scc
+  ; PAL: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load 8 from got, addrspace 4)
+  ; PAL: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1)
+  ; PAL: $vgpr0 = COPY [[UV]](s32)
+  ; PAL: $vgpr1 = COPY [[UV1]](s32)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ret i32 addrspace(1)* @external_global
+}
+
+define i32 addrspace(4)* @internal_constant_pcrel() {
+  ; HSA-LABEL: name: internal_constant_pcrel
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_constant + 4, target-flags(amdgpu-rel32-hi) @internal_constant + 4, implicit-def $scc
+  ; HSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p4)
+  ; HSA: $vgpr0 = COPY [[UV]](s32)
+  ; HSA: $vgpr1 = COPY [[UV1]](s32)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ; PAL-LABEL: name: internal_constant_pcrel
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET @internal_constant + 4, 0, implicit-def $scc
+  ; PAL: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p4)
+  ; PAL: $vgpr0 = COPY [[UV]](s32)
+  ; PAL: $vgpr1 = COPY [[UV1]](s32)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ret i32 addrspace(4)* @internal_constant
+}
+
+define i32 addrspace(1)* @internal_global_pcrel() {
+  ; HSA-LABEL: name: internal_global_pcrel
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p1) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_global + 4, target-flags(amdgpu-rel32-hi) @internal_global + 4, implicit-def $scc
+  ; HSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p1)
+  ; HSA: $vgpr0 = COPY [[UV]](s32)
+  ; HSA: $vgpr1 = COPY [[UV1]](s32)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ; PAL-LABEL: name: internal_global_pcrel
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p1) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_global + 4, target-flags(amdgpu-rel32-hi) @internal_global + 4, implicit-def $scc
+  ; PAL: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SI_PC_ADD_REL_OFFSET]](p1)
+  ; PAL: $vgpr0 = COPY [[UV]](s32)
+  ; PAL: $vgpr1 = COPY [[UV1]](s32)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  ret i32 addrspace(1)* @internal_global
+}
+
+define i32 addrspace(6)* @external_constant32_got() {
+  ; HSA-LABEL: name: external_constant32_got
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant32 + 4, target-flags(amdgpu-gotprel32-hi) @external_constant32 + 4, implicit-def $scc
+  ; HSA: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[SI_PC_ADD_REL_OFFSET]](p4) :: (dereferenceable invariant load 8 from got, addrspace 4)
+  ; HSA: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[LOAD]](p4), 0
+  ; HSA: $vgpr0 = COPY [[EXTRACT]](p6)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; PAL-LABEL: name: external_constant32_got
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET @external_constant32 + 4, 0, implicit-def $scc
+  ; PAL: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
+  ; PAL: $vgpr0 = COPY [[EXTRACT]](p6)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ret i32 addrspace(6)* @external_constant32
+}
+
+define i32 addrspace(6)* @internal_constant32_pcrel() {
+  ; HSA-LABEL: name: internal_constant32_pcrel
+  ; HSA: bb.1 (%ir-block.0):
+  ; HSA: liveins: $sgpr30_sgpr31
+  ; HSA: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; HSA: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @internal_constant32 + 4, target-flags(amdgpu-rel32-hi) @internal_constant32 + 4, implicit-def $scc
+  ; HSA: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
+  ; HSA: $vgpr0 = COPY [[EXTRACT]](p6)
+  ; HSA: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; HSA: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; PAL-LABEL: name: internal_constant32_pcrel
+  ; PAL: bb.1 (%ir-block.0):
+  ; PAL: liveins: $sgpr30_sgpr31
+  ; PAL: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; PAL: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64(p4) = SI_PC_ADD_REL_OFFSET @internal_constant32 + 4, 0, implicit-def $scc
+  ; PAL: [[EXTRACT:%[0-9]+]]:_(p6) = G_EXTRACT [[SI_PC_ADD_REL_OFFSET]](p4), 0
+  ; PAL: $vgpr0 = COPY [[EXTRACT]](p6)
+  ; PAL: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; PAL: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ret i32 addrspace(6)* @internal_constant32
+}
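Note on the HSA/PAL split in the checks above (a reading of the expected output, not asserted by the test itself): on amdhsa, external globals in every address space are reached through the GOT (amdgpu-gotprel32-lo/hi operands plus a G_LOAD from the GOT), while internal ones use direct pc-relative amdgpu-rel32-lo/hi relocations with no load. On amdpal, the constant-address-space cases (addrspace 4 and 6), both internal and external, instead use the plain fixup form of SI_PC_ADD_REL_OFFSET ("@sym + 4, 0"), which corresponds to shouldEmitFixup() returning true there, presumably because PAL emits such constants into the text section; the addrspace(1) cases behave the same as on amdhsa. The addrspace(6) functions additionally check the G_EXTRACT that truncates the 64-bit constant pointer down to the 32-bit result.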