Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td @@ -562,5 +562,6 @@ include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" +include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" include "AMDGPUCallingConv.td" Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -22,6 +22,13 @@ class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { + + unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset) const; + + void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset, unsigned DstReg) const; + public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); @@ -29,6 +36,7 @@ unsigned VReg) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef VRegs) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; }; } // End of namespace llvm; #endif Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -14,8 +14,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUCallLowering.h" +#include "AMDGPU.h" #include "AMDGPUISelLowering.h" - +#include "AMDGPUSubtarget.h" +#include "SIISelLowering.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -30,13 +35,135 @@ } 
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg) const { + const Value *Val, unsigned VReg) const { + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); return true; } +unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, + Type *ParamTy, + unsigned Offset) const { + + MachineFunction &MF = MIRBuilder.getMF(); + const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); + LLT PtrType(*PtrTy, DL); + unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); + unsigned KernArgSegmentPtr = + TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); + + unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIRBuilder.buildConstant(OffsetReg, Offset); + + MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + + return DstReg; +} + +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, + Type *ParamTy, unsigned Offset, + unsigned DstReg) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + unsigned TypeSize = DL.getTypeStoreSize(ParamTy); + unsigned Align = DL.getABITypeAlignment(ParamTy); + unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | + MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant, + TypeSize, Align); + + MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); +} + bool 
AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef VRegs) const { - // TODO: Implement once there are generic loads/stores. + + MachineFunction &MF = MIRBuilder.getMF(); + const SISubtarget *Subtarget = static_cast(&MF.getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo(); + const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + SmallVector ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(QueuePtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + const LLT P2 = LLT::pointer(2, 64); + unsigned VReg = MRI.createGenericVirtualRegister(P2); + MRI.addLiveIn(InputPtrReg, VReg); + MIRBuilder.getMBB().addLiveIn(InputPtrReg); + MIRBuilder.buildCopy(VReg, InputPtrReg); + CCInfo.AllocateReg(InputPtrReg); + } + + if (Info->hasDispatchID()) { + unsigned DispatchIDReg = Info->addDispatchID(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchIDReg); + } + + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(FlatScratchInitReg); + } + + unsigned NumArgs = F.arg_size(); + 
Function::const_arg_iterator CurOrigArg = F.arg_begin(); + const AMDGPUTargetLowering &TLI = *getTLI(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + CurOrigArg->getType()->dump(); + MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT(); + ISD::ArgFlagsTy Flags; + Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), + /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + + Function::const_arg_iterator Arg = F.arg_begin(); + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { + // FIXME: We should be getting DebugInfo from the arguments somehow. + CCValAssign &VA = ArgLocs[i]; + lowerParameter(MIRBuilder, Arg->getType(), + VA.getLocMemOffset() + + Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]); + } + return true; } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -0,0 +1,62 @@ +//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines all the static objects used by AMDGPURegisterBankInfo. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +namespace llvm { +namespace AMDGPU { + +enum PartialMappingIdx { + None = - 1, + PM_SGPR32 = 0, + PM_SGPR64 = 1, + PM_VGPR32 = 2, + PM_VGPR64 = 3 +}; + +const RegisterBankInfo::PartialMapping PartMappings[] { + // StartIdx, Length, RegBank + {0, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, + {0, 32, VGPRRegBank}, + {0, 64, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappings[] { + // SGPR 32-bit + {&PartMappings[0], 1}, + // SGPR 64-bit + {&PartMappings[1], 1}, + // VGPR 32-bit + {&PartMappings[2], 1}, + // VGPR 64-bit + {&PartMappings[3], 1} +}; + +enum ValueMappingIdx { + SGPRStartIdx = 0, + VGPRStartIdx = 2 +}; + +const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, + unsigned Size) { + assert(Size % 32 == 0); + unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx; + Idx += (Size / 32) - 1; + return &ValMappings[Idx]; +} + +} // End AMDGPU namespace. +} // End llvm namespace. Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -670,6 +671,11 @@ // TargetLowering Callbacks //===---------------------------------------------------------------------===// +CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + return CC_AMDGPU; +} + /// The SelectionDAGBuilder will automatically promote function arguments /// with illegal types. 
However, this does not work for the AMDGPU targets /// since the function arguments are stored in memory as these illegal types. Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -0,0 +1,65 @@ +//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the InstructionSelector class for +/// AMDGPU. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H + +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + +class AMDGPUInstrInfo; +class AMDGPURegisterBankInfo; +class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class SIInstrInfo; +class SIRegisterInfo; +class SISubtarget; + +class AMDGPUInstructionSelector : public InstructionSelector { +public: + AMDGPUInstructionSelector(const SISubtarget &STI, + const AMDGPURegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; + +private: + struct GEPInfo { + const MachineInstr &GEP; + SmallVector SgprParts; + SmallVector VgprParts; + int64_t Imm; + GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } + }; + + MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + bool selectG_CONSTANT(MachineInstr &I) const; + bool selectG_ADD(MachineInstr &I) const; + bool selectG_GEP(MachineInstr &I) const; + bool 
hasVgprParts(ArrayRef AddrInfo) const; + void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, + SmallVectorImpl &AddrInfo) const; + bool selectSMRD(MachineInstr &I, ArrayRef AddrInfo) const; + bool selectG_LOAD(MachineInstr &I) const; + bool selectG_STORE(MachineInstr &I) const; + + const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + const AMDGPURegisterBankInfo &RBI; +}; + +} // End llvm namespace. +#endif Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -0,0 +1,418 @@ +//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// AMDGPU. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPUInstructionSelector.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-isel" + +using namespace llvm; + +AMDGPUInstructionSelector::AMDGPUInstructionSelector( + const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI) + : InstructionSelector(), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI) {} + +MachineOperand +AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, + unsigned SubIdx) const { + + MachineInstr *MI = MO.getParent(); + MachineBasicBlock *BB = MO.getParent()->getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (MO.isReg()) { + unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); + unsigned Reg = MO.getReg(); + BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) + .addReg(Reg, 0, ComposedSubIdx); + + return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), + MO.isKill(), MO.isDead(), MO.isUndef(), + MO.isEarlyClobber(), 0, MO.isDebug(), + MO.isInternalRead()); + } + + assert(MO.isImm()); + + APInt Imm(64, MO.getImm()); + + switch (SubIdx) { + default: + llvm_unreachable("do not know to split immediate with this sub index."); + case AMDGPU::sub0: + return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); + case AMDGPU::sub1: + return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); + } +} + +bool 
AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); + unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + if (Size != 64) + return false; + + DebugLoc DL = I.getDebugLoc(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) + .add(getSubOperand64(I.getOperand(1), AMDGPU::sub0)) + .add(getSubOperand64(I.getOperand(2), AMDGPU::sub0)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) + .add(getSubOperand64(I.getOperand(1), AMDGPU::sub1)) + .add(getSubOperand64(I.getOperand(2), AMDGPU::sub1)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg()) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + + for (MachineOperand &MO : I.explicit_operands()) { + if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI); + } + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { + return selectG_ADD(I); +} + +bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + DebugLoc DL = I.getDebugLoc(); + + // FIXME: Select store instruction based on address space + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD)) + .add(I.getOperand(1)) + .add(I.getOperand(0)) + .addImm(0) + .addImm(0) + .addImm(0); + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. 
+ bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (Size == 32) { + I.setDesc(TII.get(AMDGPU::S_MOV_B32)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + assert(Size == 64); + + DebugLoc DL = I.getDebugLoc(); + unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + const APInt &Imm = I.getOperand(1).getCImm()->getValue(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + // We can't call constrainSelectedInstRegOperands here, because it doesn't + // work for target independent opcodes + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); +} + +static bool isConstant(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_CONSTANT; +} + +void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, + const MachineRegisterInfo &MRI, SmallVectorImpl &AddrInfo) const { + + const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); + + assert(PtrMI); + + if (PtrMI->getOpcode() != TargetOpcode::G_GEP) + return; + + GEPInfo GEPInfo(*PtrMI); + + for (unsigned i = 1, e = 3; i < e; ++i) { + const MachineOperand &GEPOp = PtrMI->getOperand(i); + const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); + 
assert(OpDef); + if (isConstant(*OpDef)) { + // FIXME: Is it possible to have multiple Imm parts? Maybe if we + // are lacking other optimizations. + assert(GEPInfo.Imm == 0); + GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); + continue; + } + const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); + if (OpBank->getID() == AMDGPU::SGPRRegBankID) + GEPInfo.SgprParts.push_back(GEPOp.getReg()); + else + GEPInfo.VgprParts.push_back(GEPOp.getReg()); + } + + AddrInfo.push_back(GEPInfo); + getAddrModeInfo(*PtrMI, MRI, AddrInfo); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + const Value *Ptr = MMO->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa(Ptr) || isa(Ptr) || + isa(Ptr) || isa(Ptr)) + return true; + + const Instruction *I = dyn_cast(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { + + if (LoadSize == 32) + return BaseOpcode; + + switch (BaseOpcode) { + case AMDGPU::S_LOAD_DWORD_IMM: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM; + } + break; + case AMDGPU::S_LOAD_DWORD_IMM_ci: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM_ci; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM_ci; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM_ci; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM_ci; + } + break; + case AMDGPU::S_LOAD_DWORD_SGPR: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_SGPR; + case 128: + return 
AMDGPU::S_LOAD_DWORDX4_SGPR; + case 256: + return AMDGPU::S_LOAD_DWORDX8_SGPR; + case 512: + return AMDGPU::S_LOAD_DWORDX16_SGPR; + } + break; + } + llvm_unreachable("Invalid base smrd opcode or size"); +} + +bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef AddrInfo) const { + for (const GEPInfo &GEPInfo : AddrInfo) { + if (!GEPInfo.VgprParts.empty()) + return true; + } + return false; +} + +bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, + ArrayRef AddrInfo) const { + + if (!I.hasOneMemOperand()) + return false; + + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return false; + + if (!isInstrUniform(I)) + return false; + + if (hasVgprParts(AddrInfo)) + return false; + + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + unsigned Opcode; + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { + + const GEPInfo &GEPInfo = AddrInfo[0]; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); + if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && + isUInt<32>(EncodedImm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if 
(isUInt<32>(GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addReg(OffsetReg) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + } + + unsigned PtrReg = I.getOperand(1).getReg(); + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(0) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); +} + + +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + DebugLoc DL = I.getDebugLoc(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned PtrReg = I.getOperand(1).getReg(); + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Opcode; + + SmallVector AddrInfo; + + getAddrModeInfo(I, MRI, AddrInfo); + + if (selectSMRD(I, AddrInfo)) { + I.eraseFromParent(); + return true; + } + + switch (LoadSize) { + default: + llvm_unreachable("Load size not supported\n"); + case 32: + Opcode = AMDGPU::FLAT_LOAD_DWORD; + break; + case 64: + Opcode = AMDGPU::FLAT_LOAD_DWORDX2; + break; + } + + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(I.getOperand(0)) + .addReg(PtrReg) + .addImm(0) + .addImm(0) + .addImm(0); + + bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::select(MachineInstr &I) const { + + if (!isPreISelGenericOpcode(I.getOpcode())) + return true; + + switch (I.getOpcode()) { + default: + break; + case TargetOpcode::G_ADD: + return selectG_ADD(I); + 
case TargetOpcode::G_CONSTANT: + return selectG_CONSTANT(I); + case TargetOpcode::G_GEP: + return selectG_GEP(I); + case TargetOpcode::G_LOAD: + return selectG_LOAD(I); + case TargetOpcode::G_STORE: + return selectG_STORE(I); + } + return false; +} Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -0,0 +1,30 @@ +//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class LLVMContext; + +/// This class provides the legalization rules for the AMDGPU target. +class AMDGPULegalizerInfo : public LegalizerInfo { +public: + AMDGPULegalizerInfo(); +}; +} // End llvm namespace. +#endif Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -0,0 +1,62 @@ +//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetOpcodes.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPULegalizerInfo::AMDGPULegalizerInfo() { + using namespace TargetOpcode; + + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT P1 = LLT::pointer(1, 64); + const LLT P2 = LLT::pointer(2, 64); + + setAction({G_CONSTANT, S64}, Legal); + + setAction({G_GEP, P1}, Legal); + setAction({G_GEP, P2}, Legal); + setAction({G_GEP, 1, S64}, Legal); + + setAction({G_LOAD, P1}, Legal); + setAction({G_LOAD, P2}, Legal); + setAction({G_LOAD, S32}, Legal); + setAction({G_LOAD, 1, P1}, Legal); + setAction({G_LOAD, 1, P2}, Legal); + + setAction({G_STORE, S32}, Legal); + setAction({G_STORE, 1, P1}, Legal); + + // FIXME: When RegBankSelect inserts copies, it will only create new + // registers with scalar types. This means we can end up with + // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer + // operands. 
In assert builds, the instruction selector will assert + // if it sees a generic instruction which isn't legal, so we need to + // tell it that scalar types are legal for pointer operands + setAction({G_GEP, S64}, Legal); + setAction({G_LOAD, 1, S64}, Legal); + setAction({G_STORE, 1, S64}, Legal); + + computeTables(); +} Index: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -0,0 +1,65 @@ +//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +namespace llvm { + +class SIRegisterInfo; +class TargetRegisterInfo; + +namespace AMDGPU { +enum { + SGPRRegBankID = 0, + VGPRRegBankID = 1, + NumRegisterBanks +}; +} // End AMDGPU namespace. + +/// This class provides the information for the target register banks. +class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { + +protected: + +#define GET_TARGET_REGBANK_CLASS +#include "AMDGPUGenRegisterBank.inc" + +}; +class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const SIRegisterInfo *TRI; + + /// See RegisterBankInfo::applyMapping. 
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + + RegisterBankInfo::InstructionMapping + getInstrMappingForLoad(const MachineInstr &MI) const; + +public: + AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; + + InstructionMapping getInstrMapping(const MachineInstr &MI) const override; +}; +} // End llvm namespace. +#endif Index: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -0,0 +1,228 @@ +//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPUInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "AMDGPUGenRegisterBank.inc" + +// This file will be TableGen'ed at some point. 
+#include "AMDGPUGenRegisterBankInfo.def" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) + : AMDGPUGenRegisterBankInfo(), + TRI(static_cast(&TRI)) { + + // HACK: Until this is fully tablegen'd + static bool AlreadyInit = false; + if (AlreadyInit) + return; + + AlreadyInit = true; + + const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); + assert(&RBSGPR == &AMDGPU::SGPRRegBank); + + const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); + assert(&RBVGPR == &AMDGPU::VGPRRegBank); + +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + + if (TRI->isSGPRClass(&RC)) + return getRegBank(AMDGPU::SGPRRegBankID); + + return getRegBank(AMDGPU::VGPRRegBankID); +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + InstructionMappings AltMappings; + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + // FIXME: Should we be hard coding the size for these mappings? 
+ InstructionMapping SSMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(SSMapping)); + + InstructionMapping VVMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(VVMapping)); + + // FIXME: Should this be the pointer-size (64-bits) or the size of the + // register that will hold the buffer resource (128-bits)? + InstructionMapping VSMapping(3, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(VSMapping)); + + return AltMappings; + + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AMDGPURegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + return applyDefaultMapping(OpdMapper); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return AMDGPU::isUniformMMO(MMO); +} + +RegisterBankInfo::InstructionMapping +AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + RegisterBankInfo::InstructionMapping Mapping = + InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + SmallVector OpdsMapping(MI.getNumOperands()); + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + + const ValueMapping *ValMapping; + const ValueMapping *PtrMapping; + + if (isInstrUniform(MI)) { + // We have a uniform instruction so we want to 
use an SMRD load + ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); + } else { + ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: What would happen if we used SGPRRegBankID here? + PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + } + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + + // FIXME: Do we want to add a mapping for FLAT load, or should we just + // handle that during instruction selection? +} + +RegisterBankInfo::InstructionMapping +AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI); + + if (Mapping.isValid()) + return Mapping; + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + SmallVector OpdsMapping(MI.getNumOperands()); + + switch (MI.getOpcode()) { + default: break; + case AMDGPU::G_CONSTANT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + case AMDGPU::G_GEP: { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg()) + continue; + + unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits(); + OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + } + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + case AMDGPU::G_STORE: { + assert(MI.getOperand(0).isReg()); + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + // FIXME: We need to specify a different reg bank once scalar stores + // 
are supported. + const ValueMapping *ValMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: Depending on the type of store, the pointer could be in + // the SGPR Reg bank. + // FIXME: Pointer size should be based on the address space. + const ValueMapping *PtrMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + + case AMDGPU::G_LOAD: + return getInstrMappingForLoad(MI); + } + + unsigned BankID = AMDGPU::SGPRRegBankID; + + Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + unsigned Size = 0; + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + // If the operand is not a register default to the size of the previous + // operand. + // FIXME: Can't we pull the types from the MachineInstr rather than the + // operands. + if (MI.getOperand(Idx).isReg()) + Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI); + OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size)); + } + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + + return Mapping; +} Index: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBanks.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -0,0 +1,16 @@ +//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +def SGPRRegBank : RegisterBank<"SGPR", + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] +>; + +def VGPRRegBank : RegisterBank<"VGPR", + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] +>; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -517,6 +517,21 @@ return GISel->getCallLowering(); } + const InstructionSelector *getInstructionSelector() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getInstructionSelector(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getLegalizerInfo(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); + } + const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,18 +16,18 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" #include "SIMachineScheduler.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include
"llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/TargetRegistry.h" @@ -256,9 +256,21 @@ struct SIGISelActualAccessor : public GISelAccessor { std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; const AMDGPUCallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } }; } // end anonymous namespace @@ -292,6 +304,11 @@ SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); GISel->CallLoweringInfo.reset( new AMDGPUCallLowering(*I->getTargetLowering())); + GISel->Legalizer.reset(new AMDGPULegalizerInfo()); + + GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo())); + GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I, + *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get()))); #endif I->setGISelAccessor(*GISel); @@ -592,16 +609,20 @@ } bool GCNPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); return false; } bool GCNPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); return false; } bool GCNPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); return false; } + #endif void GCNPassConfig::addPreRegAlloc() { Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -12,11 +12,17 @@
tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) +if(LLVM_BUILD_GLOBAL_ISEL) + tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) +endif() add_public_tablegen_target(AMDGPUCommonTableGen) # List of all GlobalISel files. set(GLOBAL_ISEL_FILES AMDGPUCallLowering.cpp + AMDGPUInstructionSelector.cpp + AMDGPULegalizerInfo.cpp + AMDGPURegisterBankInfo.cpp ) # Add GlobalISel files to the dependencies if the user wants to build it. Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -10,10 +10,10 @@ #include "AMDGPU.h" #include "SIDefines.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -0,0 +1,25 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN + +--- | + define void @global_addrspace(i32 addrspace(1)* %global0) { ret void } +... 
+--- + +name: global_addrspace +legalized: true +regBankSelected: true + +# GCN: global_addrspace +# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 +# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0 + +body: | + bb.0: + liveins: %vgpr0_vgpr1 + + %0:vgpr(p1) = COPY %vgpr0_vgpr1 + %1:vgpr(s32) = G_LOAD %0 :: (load 4 from %ir.global0) + +... +--- Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -0,0 +1,141 @@ +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,SI,SICI,SIVI +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,CI,SICI +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,VI,SIVI + +--- | + define void @smrd_imm(i32 addrspace(2)* %const0) { ret void } +... +--- + +name: smrd_imm +legalized: true +regBankSelected: true + +# GCN: body: +# GCN: [[PTR:%[0-9]+]] = COPY %sgpr0_sgpr1 + +# Immediate offset: +# SICI: S_LOAD_DWORD_IMM [[PTR]], 1, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 4, 0 + +# Max immediate offset for SI +# SICI: S_LOAD_DWORD_IMM [[PTR]], 255, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0 + +# Immediate overflow for SI +# FIXME: The immediate gets selected twice, once into the +# S_LOAD_DWORD instruction and once just as a normal constant.
+# SI: S_MOV_B32 1024 +# SI: [[K1024:%[0-9]+]] = S_MOV_B32 1024 +# SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0 + +# Max immediate offset for VI +# SI: S_MOV_B32 1048572 +# SI: [[K1048572:%[0-9]+]] = S_MOV_B32 1048572 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143 +# VI: S_LOAD_DWORD_IMM [[PTR]], 1048572 + +# +# Immediate overflow for VI +# FIXME: The immediate gets selected twice, once into the +# S_LOAD_DWORD instruction and once just as a normal constant. +# SIVI: S_MOV_B32 1048576 +# SIVI: [[K1048576:%[0-9]+]] = S_MOV_B32 1048576 +# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0 + +# Max immediate for CI +# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 4294967292 +# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 3 +# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0 + +# Immediate overflow for CI +# GCN: [[K_LO:%[0-9]+]] = S_MOV_B32 0 +# GCN: [[K_HI:%[0-9]+]] = S_MOV_B32 4 +# GCN: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# GCN: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# GCN: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# GCN: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# GCN: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# GCN: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# GCN: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# GCN: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]],
0, 0 + +# Max 32-bit byte offset +# FIXME: The immediate gets selected twice, once into the +# S_LOAD_DWORD instruction and once just as a normal constant. +# SIVI: S_MOV_B32 4294967292 +# SIVI: [[K4294967292:%[0-9]+]] = S_MOV_B32 4294967292 +# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0 + +# Overflow 32-bit byte offset +# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 0 +# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 1 +# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0 +# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0 +# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] +# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1 +# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1 +# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] +# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 +# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0 + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + + %0:sgpr(p2) = COPY %sgpr0_sgpr1 + + %1:sgpr(s64) = G_CONSTANT i64 4 + %2:sgpr(p2) = G_GEP %0, %1 + %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0) + + %4:sgpr(s64) = G_CONSTANT i64 1020 + %5:sgpr(p2) = G_GEP %0, %4 + %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0) + + %7:sgpr(s64) = G_CONSTANT i64 1024 + %8:sgpr(p2) = G_GEP %0, %7 + %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0) + + %10:sgpr(s64) = G_CONSTANT i64 1048572 + %11:sgpr(p2) = G_GEP %0, %10 + %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0) + + %13:sgpr(s64) = G_CONSTANT i64 1048576 + %14:sgpr(p2) = G_GEP %0, %13 + %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0) + + %16:sgpr(s64) = G_CONSTANT i64 17179869180 + %17:sgpr(p2) = G_GEP %0, %16 + %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0) + + %19:sgpr(s64) = G_CONSTANT i64 17179869184 + %20:sgpr(p2) = G_GEP %0, %19 + %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0) +
+ %22:sgpr(s64) = G_CONSTANT i64 4294967292 + %23:sgpr(p2) = G_GEP %0, %22 + %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0) + + %25:sgpr(s64) = G_CONSTANT i64 4294967296 + %26:sgpr(p2) = G_GEP %0, %25 + %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0) + +... +--- Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -0,0 +1,27 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN + +--- | + define void @global_addrspace(i32 addrspace(1)* %global0) { ret void } +... +--- + +name: global_addrspace +legalized: true +regBankSelected: true + +# GCN: global_addrspace +# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 +# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2 +# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0 + +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2 + + %0:vgpr(p1) = COPY %vgpr0_vgpr1 + %1:vgpr(s32) = COPY %vgpr2 + G_STORE %1, %0 :: (store 4 into %ir.global0) + +... 
+--- Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir @@ -0,0 +1,67 @@ +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect -global-isel %s -verify-machineinstrs -o - | FileCheck %s + +--- | + define void @load_constant(i32 addrspace(2)* %ptr0) { ret void } + define void @load_global_uniform(i32 addrspace(1)* %ptr1) { + %tmp0 = load i32, i32 addrspace(1)* %ptr1 + ret void + } + define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0 + %tmp2 = load i32, i32 addrspace(1)* %tmp1 + ret void + } + declare i32 @llvm.amdgcn.workitem.id.x() #0 + attributes #0 = { nounwind readnone } +... + +--- +name : load_constant +legalized: true + +# CHECK-LABEL: name: load_constant +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: sgpr } + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p2) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0) +... + +--- +name: load_global_uniform +legalized: true + +# CHECK-LABEL: name: load_global_uniform +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: sgpr } + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p1) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1) +... + +--- +name: load_global_non_uniform +legalized: true + +# CHECK-LABEL: name: load_global_non_uniform +# CHECK: registers: +# CHECK: - { id: 0, class: sgpr } +# CHECK: - { id: 1, class: vgpr } +# CHECK: - { id: 2, class: vgpr } + + +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0:_(p1) = COPY %sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1) +... 
Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: vs_epilog +; GCN: s_endpgm + +define amdgpu_vs void @vs_epilog() { +main_body: + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -0,0 +1,87 @@ +; FIXME: Need to add support for mubuf stores to enable this on SI. +; XUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s + +; SMRD load with an immediate offset. +; GCN-LABEL: {{^}}smrd0: +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset. 
+; GCN-LABEL: {{^}}smrd1: +; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate. +; GCN-LABEL: {{^}}smrd2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; GCN: s_endpgm +define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with a 64-bit offset +; GCN-LABEL: {{^}}smrd3: +; FIXME: There are too many copies here because we don't fold immediates +; through REG_SEQUENCE +; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b +; TODO: Add VI checks +; XGCN: s_endpgm +define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset on VI +; GCN-LABEL: {{^}}smrd4: +; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %1 = load 
i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate on VI +; GCN-LABEL: {{^}}smrd5: +; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 +; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 +; GCN: s_endpgm +define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} +