Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -106,7 +106,7 @@ AMDGPUTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(2, 0); + TS->EmitDirectiveHSACodeObjectVersion(2, 1); AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -39,6 +39,13 @@ Ctx(ctx), ST(st) { } +static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { + switch (MOFlags) { + default: return MCSymbolRefExpr::VK_None; + case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL; + } +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); @@ -69,7 +76,8 @@ case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - const MCExpr *SymExpr = MCSymbolRefExpr::create(Sym, Ctx); + const MCExpr *SymExpr = + MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()), Ctx); const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); MCOp = MCOperand::createExpr(Expr); Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -107,9 +107,9 @@ } static Reloc::Model getEffectiveRelocModel(Optional RM) { - if (!RM.hasValue()) - return Reloc::PIC_; - return *RM; + // The AMDGPU toolchain only supports generating 
shared objects, so we + // must always use PIC. + return Reloc::PIC_; } AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -27,6 +27,7 @@ #include "SIRegisterInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -500,9 +501,11 @@ const Value *Ptr = MemNode->getMemOperand()->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers - if (isa(Ptr) || isa(Ptr) || isa(Ptr) || - isa(Ptr)) + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. 
+ if (!Ptr || isa(Ptr) || isa(Ptr) || + isa(Ptr) || isa(Ptr)) return true; const Instruction *I = dyn_cast_or_null(Ptr); @@ -1422,27 +1425,26 @@ return DAG.getUNDEF(ASC->getValueType(0)); } -bool -SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { - if (GA->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) - return false; - - return TargetLowering::isOffsetFoldingLegal(GA); -} - -SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, - SDValue Op, - SelectionDAG &DAG) const { - GlobalAddressSDNode *GSD = cast(Op); +SDValue SITargetLowering::lowerTRAP(SDValue Op, + SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "trap handler not supported", + Op.getDebugLoc(), + DS_Warning); + DAG.getContext()->diagnose(NoTrap); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) - return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + // Emit s_endpgm. - SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); - EVT PtrVT = Op.getValueType(); + // FIXME: This should really be selected to s_trap, but that requires + setting up the trap handler for it to do anything. + return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, + Op.getOperand(0)); +} +static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + SDLoc DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: // s_getpc_b64 s[0:1] @@ -1460,26 +1462,54 @@ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. 
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, - GSD->getOffset() + 4); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags); return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA); } -SDValue SITargetLowering::lowerTRAP(SDValue Op, - SelectionDAG &DAG) const { - const MachineFunction &MF = DAG.getMachineFunction(); - DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), - "trap handler not supported", - Op.getDebugLoc(), - DS_Warning); - DAG.getContext()->diagnose(NoTrap); +static bool shouldEmitGOTReloc(const GlobalValue *GV, + const TargetMachine &TM) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldAssumeDSOLocal(TM.getRelocationModel(), TM.getTargetTriple(), + *GV->getParent(), GV); +} - // Emit s_endpgm. +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // We can fold offsets for anything that doesn't require a GOT relocation. + return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); +} - // FIXME: This should really be selected to s_trap, but that requires - // setting up the trap handler for it o do anything. 
- return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, - Op.getOperand(0)); +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GSD = cast(Op); + + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + const SIMachineFunctionInfo *SIInfo = + static_cast(MFI); + SDLoc DL(GSD); + const GlobalValue *GV = GSD->getGlobal(); + EVT PtrVT = Op.getValueType(); + + if (!shouldEmitGOTReloc(GV, getTargetMachine())) + return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); + + SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, + SIInstrInfo::MO_GOTPCREL); + + Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + const DataLayout &DataLayout = DAG.getDataLayout(); + unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + MachinePointerInfo PtrInfo(SIInfo->getGOT()); + + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, + PtrInfo, false, false, true, Align); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -91,6 +91,12 @@ unsigned OpIdx1) const override; public: + + enum TargetOperandFlags { + MO_NONE = 0, + MO_GOTPCREL = 1 + }; + explicit SIInstrInfo(const AMDGPUSubtarget &st); const SIRegisterInfo &getRegisterInfo() const override { Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "AMDGPUMachineFunction.h" #include "SIRegisterInfo.h" +#include 
"llvm/CodeGen/PseudoSourceValue.h" #include namespace llvm { @@ -102,6 +103,7 @@ bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + const PseudoSourceValue GOTPSV; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -335,6 +337,8 @@ } unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; + + const PseudoSourceValue *getGOT() const { return &GOTPSV; } }; } // End namespace llvm Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,8 @@ PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), - WorkItemIDZ(false) { + WorkItemIDZ(false), + GOTPSV(PseudoSourceValue::GOT, AMDGPUAS::CONSTANT_ADDRESS) { const AMDGPUSubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); Index: test/CodeGen/AMDGPU/global-variable-relocs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -0,0 +1,203 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s + +@private = private addrspace(1) global [256 x i32] zeroinitializer +@internal = internal addrspace(1) global [256 x i32] zeroinitializer +@available_externally = available_externally addrspace(1) global [256 x i32] zeroinitializer +@linkonce = linkonce addrspace(1) global [256 x i32] zeroinitializer +@weak= weak addrspace(1) global [256 x i32] zeroinitializer +@common = common addrspace(1) global [256 x i32] zeroinitializer +@extern_weak = extern_weak addrspace(1) global [256 x i32] +@linkonce_odr = linkonce_odr addrspace(1) global [256 x i32] zeroinitializer +@weak_odr = weak_odr addrspace(1) global [256 x i32] zeroinitializer +@external = external addrspace(1) global [256 x i32] +@external_w_init = addrspace(1) global [256 x i32] zeroinitializer + +; 
CHECK-LABEL: private_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private+8 +; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @private_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: internal_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal+8 +; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @internal_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: available_externally_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void 
@available_externally_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: linkonce_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @linkonce_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: weak_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @weak_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 
addrspace(1)* %out + ret void +} + +; CHECK-LABEL: common_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @common_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: extern_weak_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @extern_weak_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: linkonce_odr_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], 
linkonce_odr@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @linkonce_odr_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: weak_odr_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @weak_odr_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: external_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, 
s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @external_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: external_w_init_test: +; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@GOTPCREL+4 +; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0 +; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 +; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 +; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] +; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] +; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +define void @external_w_init_test(i32 addrspace(1)* %out) { + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 + %val = load i32, i32 addrspace(1)* %ptr + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; CHECK: .local private +; CHECK: .local internal +; CHECK: .weak linkonce +; CHECK: .weak weak +; CHECK: .weak linkonce_odr +; CHECK: .weak weak_odr +; CHECK-NOT: external{{$}} +; CHECK: .globl external_w_init Index: test/CodeGen/AMDGPU/global-zero-initializer.ll =================================================================== --- test/CodeGen/AMDGPU/global-zero-initializer.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: llc 
-march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: {{^}}load_init_global_global: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], global+4 -; CHECK: s_addc_u32 s5, s[[PC_HI]], 0 -; CHECK: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[ADDR_LO]]:7], 0 offset:40 -; CHECK: global: -; CHECK: .zero 1024 -@global = addrspace(1) global [256 x i32] zeroinitializer - -define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @global, i32 0, i32 10 - %ld = load i32, i32 addrspace(1)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} Index: test/CodeGen/AMDGPU/hsa-note-no-func.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s -; HSA: .hsa_code_object_version 2,0 +; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" ; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" Index: test/CodeGen/AMDGPU/hsa.ll =================================================================== --- test/CodeGen/AMDGPU/hsa.ll +++ test/CodeGen/AMDGPU/hsa.ll @@ -18,7 +18,7 @@ ; ELF: SHT_NOTE ; ELF: 0000: 04000000 08000000 01000000 414D4400 -; ELF: 0010: 02000000 00000000 04000000 1B000000 +; ELF: 0010: 02000000 01000000 04000000 1B000000 ; ELF: 0020: 03000000 414D4400 04000700 07000000 ; ELF: 0030: 00000000 00000000 414D4400 414D4447 ; ELF: 0040: 50550000 @@ -29,7 +29,7 @@ ; ELF: Type: AMDGPU_HSA_KERNEL (0xA) ; ELF: } -; HSA: .hsa_code_object_version 2,0 +; HSA: .hsa_code_object_version 2,1 ; HSA-CI: 
.hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"