Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -48,6 +48,7 @@ FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIExpandPostRASchedPseudos(); ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); @@ -78,6 +79,8 @@ void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; +void initializeSIExpandPostRASchedPseudosPass(PassRegistry&); +extern char &SIExpandPostRASchedPseudosID; extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -92,8 +95,6 @@ }; } -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - } // End namespace llvm namespace ShaderType { Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -99,8 +99,6 @@ void EmitFunctionBodyStart() override; - void EmitEndOfAsmFile(Module &M) override; - void EmitFunctionEntryLabel() override; void EmitGlobalVariable(const GlobalVariable *GV) override; Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -100,16 +100,6 @@ } } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. 
- const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); -} - void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -73,13 +73,6 @@ MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,7 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" -#include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -50,6 +50,7 @@ initializeSIFixSGPRLiveRangesPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); + initializeSIExpandPostRASchedPseudosPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); } @@ -57,7 +58,7 @@ if (TT.getOS() == Triple::AMDHSA) return 
make_unique<AMDGPUHSATargetObjectFile>(); - return make_unique<TargetLoweringObjectFileELF>(); + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -334,6 +335,7 @@ } void GCNPassConfig::addPreEmitPass() { + addPass(createSIExpandPostRASchedPseudos(), false); addPass(createSIInsertWaits(*TM), false); addPass(createSILowerControlFlowPass(*TM), false); } Index: lib/Target/AMDGPU/AMDGPUTargetObjectFile.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- AMDGPUHSATargetObjectFile.h - AMDGPU HSA Object Info ----*- C++ -*-===// +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -9,19 +9,26 @@ /// /// \file /// \brief This file declares the AMDGPU-specific subclass of -/// TargetLoweringObjectFile use for targeting the HSA-runtime. +/// TargetLoweringObjectFile. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHSATARGETOBJECTFILE_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHSATARGETOBJECTFILE_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetMachine.h" namespace llvm { -class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF { +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { private: MCSection *DataGlobalAgentSection; MCSection *DataGlobalProgramSection; Index: lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp =================================================================== --- 
lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUHSATargetObjectFile.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCContext.h" @@ -16,6 +16,25 @@ using namespace llvm; +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ TargetLoweringObjectFileELF::Initialize(Ctx, TM); Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. 
+ // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. *Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -250,17 +250,7 @@ if (MO.isExpr()) { const MCSymbolRefExpr *Expr = 
cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. - Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } Index: lib/Target/AMDGPU/SIExpandPostRASchedPseudos.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SIExpandPostRASchedPseudos.cpp @@ -0,0 +1,106 @@ +//===-- SIExpandPostRASchedPseudos.cpp - Pseudo expansion pass -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass expands instructions after the post-RA machine scheduler. It is +/// necessary for pseudo instructions that are expanded to multiple instructions +/// that must be scheduled adjacent. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-expand-post-ra-sched-pseudos" + +namespace { +class SIExpandPostRASchedPseudos : public MachineFunctionPass { + +public: + static char ID; // Pass identification, replacement for typeid + SIExpandPostRASchedPseudos() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + /// runOnMachineFunction - pass entry point + bool runOnMachineFunction(MachineFunction&) override; + +}; +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(SIExpandPostRASchedPseudos, DEBUG_TYPE, + "SI expand post-ra scheduler pseudos", false, false) +INITIALIZE_PASS_END(SIExpandPostRASchedPseudos, DEBUG_TYPE, + "SI expand post-ra scheduler pseudos", false, false) + +char SIExpandPostRASchedPseudos::ID = 0; + +char &llvm::SIExpandPostRASchedPseudosID = SIExpandPostRASchedPseudos::ID; + +bool SIExpandPostRASchedPseudos::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(ST.getInstrInfo()); + + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) { + + MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end(); + I != E;) { + MachineInstr *MI = I; + // Advance iterator here because MI may be erased. + ++I; + + // Only expand pseudos. 
+ if (!MI->isPseudo()) + continue; + + DebugLoc DL = MI->getDebugLoc(); + switch(MI->getOpcode()) { + default: break; + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0); + MI->eraseFromParent(); + break; + } + } + } + } + + return false; +} + +FunctionPass *llvm::createSIExpandPostRASchedPseudos() { + return new SIExpandPostRASchedPseudos(); +} Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1094,20 +1094,8 @@ const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, Index: lib/Target/AMDGPU/SIInstrInfo.cpp 
=================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -762,26 +762,6 @@ switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. 
MI->eraseFromParent(); Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -133,7 +133,8 @@ def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> >; //===----------------------------------------------------------------------===// @@ -366,6 +367,8 @@ let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2039,8 +2039,8 @@ def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] > { let SALU = 1; } Index: test/CodeGen/AMDGPU/global-constant.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-constant.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +@readonly = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] +@readonly2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] + +; GCN-LABEL: {{^}}main: +; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], 0 +; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly +; GCN: s_addc_u32 s{{[0-9]+}}, 
s[[PC1_HI]], 0 +; GCN: .text +; GCN: readonly: +; GCN: readonly2: +define void @main(i32 %index, float addrspace(1)* %out) { + %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly, i32 0, i32 %index + %val = load float, float addrspace(2)* %ptr + store float %val, float addrspace(1)* %out + %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @readonly2, i32 0, i32 %index + %val2 = load float, float addrspace(2)* %ptr2 + store float %val2, float addrspace(1)* %out + ret void +} +