Index: lib/Target/R600/AMDGPU.h =================================================================== --- lib/Target/R600/AMDGPU.h +++ lib/Target/R600/AMDGPU.h @@ -76,33 +76,34 @@ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. - PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + FLAT_ADDRESS = 4, ///< Address space for flat accesses to local, private or global. + REGION_ADDRESS = 5, ///< Address space for region memory. + ADDRESS_NONE = 6, ///< Address space for unknown memory. + PARAM_D_ADDRESS = 7, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 8, ///< Address space for indirect addressible parameter memory (VTX1) // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this // order to be able to dynamically index a constant buffer, for example: // // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + CONSTANT_BUFFER_0 = 9, + CONSTANT_BUFFER_1 = 10, + CONSTANT_BUFFER_2 = 11, + CONSTANT_BUFFER_3 = 12, + CONSTANT_BUFFER_4 = 13, + CONSTANT_BUFFER_5 = 14, + CONSTANT_BUFFER_6 = 15, + CONSTANT_BUFFER_7 = 16, + CONSTANT_BUFFER_8 = 17, + CONSTANT_BUFFER_9 = 18, + CONSTANT_BUFFER_10 = 19, + CONSTANT_BUFFER_11 = 20, + CONSTANT_BUFFER_12 = 21, + CONSTANT_BUFFER_13 = 22, + CONSTANT_BUFFER_14 = 23, + CONSTANT_BUFFER_15 = 24, + LAST_ADDRESS = 25 }; } // namespace AMDGPUAS Index: lib/Target/R600/AMDGPU.td =================================================================== --- lib/Target/R600/AMDGPU.td +++ lib/Target/R600/AMDGPU.td @@ -68,6 +68,11 @@ "true", "GPU has CF_ALU bug">; +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + class SubtargetFeatureFetchLimit : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -108,7 +113,7 @@ [Feature64BitPtr, FeatureFP64]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureFlatAddressSpace]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { Index: lib/Target/R600/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/R600/AMDGPUAsmPrinter.h +++ lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,15 +24,19 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {} + SIProgramInfo() : NumSGPR(0), + NumVGPR(0), + VCCUsed(false), + FlatUsed(false) {} unsigned NumSGPR; unsigned NumVGPR; + bool VCCUsed; + bool FlatUsed; }; void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; - void findNumUsedRegistersSI(MachineFunction &MF, - unsigned 
&NumSGPR, - unsigned &NumVGPR) const; + void findUsedRegistersSI(MachineFunction &MF, + SIProgramInfo &Out) const; /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. Index: lib/Target/R600/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/R600/AMDGPUAsmPrinter.cpp +++ lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -66,7 +66,7 @@ const AMDGPUSubtarget &STM = TM.getSubtarget(); SIProgramInfo KernelInfo; if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR); + findUsedRegistersSI(MF, KernelInfo); EmitProgramInfoSI(MF, KernelInfo); } else { EmitProgramInfoR600(MF); @@ -186,14 +186,14 @@ } } -void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const { +void AMDGPUAsmPrinter::findUsedRegistersSI(MachineFunction &MF, + SIProgramInfo &Out) const { unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; - const SIRegisterInfo * RI = - static_cast(TM.getRegisterInfo()); + bool FlatUsed = false; + const SIRegisterInfo *RI + = static_cast(TM.getRegisterInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { @@ -215,6 +215,10 @@ if (reg == AMDGPU::VCC) { VCCUsed = true; continue; + } else if (reg == AMDGPU::FLAT_SCRATCH_SIZE || + reg == AMDGPU::FLAT_SCRATCH_OFFSET) { + FlatUsed = true; + continue; } switch (reg) { @@ -275,13 +279,18 @@ if (VCCUsed) MaxSGPR += 2; - NumSGPR = MaxSGPR; - NumVGPR = MaxVGPR; + if (FlatUsed) + MaxSGPR += 2; + + Out.NumSGPR = MaxSGPR; + Out.NumVGPR = MaxVGPR; + Out.VCCUsed = VCCUsed; + Out.FlatUsed = FlatUsed; } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const { - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR); + findUsedRegistersSI(MF, Out); } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, @@ -316,6 +325,7 @@ if (MFI->ShaderType == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + // TODO: Should probably note flat usage somewhere } if (MFI->ShaderType == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -61,10 +61,12 @@ SDValue SimplifyI24(SDValue &Op); bool SelectI24(SDValue Addr, SDValue &Op); bool SelectU24(SDValue Addr, SDValue &Op); + SDNode *SelectAddrSpaceCast(SDNode *N); static bool checkType(const Value *ptr, unsigned int addrspace); static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); static bool isLocalStore(const StoreSDNode *N); static bool isRegionStore(const StoreSDNode *N); @@ -72,6 +74,7 @@ bool isCPLoad(const LoadSDNode *N) const; bool isConstantLoad(const LoadSDNode *N, int cbID) const; bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; bool isParamLoad(const LoadSDNode *N) const; bool isPrivateLoad(const LoadSDNode *N) const; bool isLocalLoad(const LoadSDNode *N) const; @@ -343,6 +346,9 @@ CurDAG->getVTList(MVT::Other), Ops); } + + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); } return SelectCode(N); } @@ -370,6 +376,10 @@ return 
checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); } @@ -400,6 +410,10 @@ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); } @@ -428,6 +442,7 @@ } if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) @@ -558,6 +573,58 @@ return false; } +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast(N); + SDLoc DL(N); + + assert(Subtarget.hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. + + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + return CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, + DL, + DestVT, + CurDAG->getTargetConstant(0, MVT::i32), + Src, + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = (*(const AMDGPUTargetLowering*)getTargetLowering()); Index: lib/Target/R600/AMDGPUInstrInfo.h =================================================================== --- lib/Target/R600/AMDGPUInstrInfo.h +++ lib/Target/R600/AMDGPUInstrInfo.h @@ -100,6 +100,7 @@ MachineInstr *MI, const SmallVectorImpl &Ops, MachineInstr *LoadMI) const; +public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. virtual int getIndirectIndexBegin(const MachineFunction &MF) const; @@ -108,7 +109,6 @@ /// read or write or -1 if indirect addressing is not used by this program. 
virtual int getIndirectIndexEnd(const MachineFunction &MF) const; -public: bool canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl &Ops) const; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, Index: lib/Target/R600/AMDGPUInstructions.td =================================================================== --- lib/Target/R600/AMDGPUInstructions.td +++ lib/Target/R600/AMDGPUInstructions.td @@ -133,6 +133,14 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); }]>; @@ -161,6 +169,14 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); }]>; @@ -186,6 +202,11 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi32_constant : PatFrag<(ops node:$ptr), (az_extloadi32 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); @@ -201,6 +222,16 @@ return isGlobalStore(dyn_cast(N)); }]>; +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast(N)); @@ -235,6 +266,11 @@ return dyn_cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return dyn_cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; Index: lib/Target/R600/AMDGPUMachineFunction.h =================================================================== --- lib/Target/R600/AMDGPUMachineFunction.h +++ lib/Target/R600/AMDGPUMachineFunction.h @@ -28,6 +28,8 @@ std::map LocalMemoryObjects; /// Number of bytes in the LDS that are being used. 
unsigned LDSSize; + unsigned ScratchSize; + bool IsKernel; }; } Index: lib/Target/R600/AMDGPUMachineFunction.cpp =================================================================== --- lib/Target/R600/AMDGPUMachineFunction.cpp +++ lib/Target/R600/AMDGPUMachineFunction.cpp @@ -10,9 +10,11 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo() { - ShaderType = ShaderType::COMPUTE; - LDSSize = 0; + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); Index: lib/Target/R600/AMDGPUSubtarget.h =================================================================== --- lib/Target/R600/AMDGPUSubtarget.h +++ lib/Target/R600/AMDGPUSubtarget.h @@ -49,6 +49,7 @@ enum Generation Gen; bool FP64; bool CaymanISA; + bool FlatAddressSpace; bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; @@ -68,6 +69,9 @@ enum Generation getGeneration() const; bool hasHWFP64() const; bool hasCaymanISA() const; + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; Index: lib/Target/R600/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/R600/AMDGPUSubtarget.cpp +++ lib/Target/R600/AMDGPUSubtarget.cpp @@ -36,6 +36,7 @@ Gen = AMDGPUSubtarget::R600; FP64 = false; CaymanISA = false; + FlatAddressSpace = false; EnableIRStructurizer = true; EnableIfCvt = true; WavefrontSize = 0; Index: lib/Target/R600/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/R600/AMDGPUTargetMachine.cpp +++ lib/Target/R600/AMDGPUTargetMachine.cpp @@ -53,8 +53,9 @@ std::string Ret = "e-p:32:32"; if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + // 32-bit private, local, and region pointers. 64-bit global, flat and + // constant. 
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" Index: lib/Target/R600/AMDILInstrInfo.td =================================================================== --- lib/Target/R600/AMDILInstrInfo.td +++ lib/Target/R600/AMDILInstrInfo.td @@ -74,6 +74,12 @@ return isGlobalStore(dyn_cast(N)); }]>; +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + + //===----------------------------------------------------------------------===// // Load pattern fragments //===----------------------------------------------------------------------===// @@ -81,6 +87,10 @@ def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isGlobalLoad(dyn_cast(N)); }]>; +// Flat address space loads +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; // Constant address space loads def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); Index: lib/Target/R600/SIInstrFormats.td =================================================================== --- lib/Target/R600/SIInstrFormats.td +++ lib/Target/R600/SIInstrFormats.td @@ -446,6 +446,36 @@ let MIMG = 1; } +class FLAT op, dag outs, dag ins, string asm, list pattern> : + Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; + + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let EXP_CNT = 1; // XXX - Need this? + let LGKM_CNT = 1; + + let neverHasSideEffects = 1; +} + def EXP : Enc64< (outs), (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, Index: lib/Target/R600/SIInstrInfo.cpp =================================================================== --- lib/Target/R600/SIInstrInfo.cpp +++ lib/Target/R600/SIInstrInfo.cpp @@ -383,6 +383,11 @@ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) ++ConstantBusCount; + // XXX - I'm sort of guessing about this. 
+ if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCRATCH_SIZE || + MO.getReg() == AMDGPU::FLAT_SCRATCH_OFFSET)) + ++ConstantBusCount; + // SGPRs use the constant bus if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || (!MO.isImplicit() && Index: lib/Target/R600/SIInstrInfo.td =================================================================== --- lib/Target/R600/SIInstrInfo.td +++ lib/Target/R600/SIInstrInfo.td @@ -132,6 +132,8 @@ def SIOperand { int ZERO = 0x80; int VCC = 0x6A; + int FLAT_SCRATCH_OFFSET = 0x68; + int FLAT_SCRATCH_SIZE = 0x69; } include "SIInstrFormats.td" @@ -490,6 +492,31 @@ let soffset = 128; // ZERO } +class FLAT_Load_Helper op, string asm, RegisterClass regClass> : + FLAT { + let glc = 0; + let slc = 0; + let tfe = 0; + let mayLoad = 1; + let Uses = [EXEC, M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]; +} + +class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : + FLAT { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let glc = 0; + let slc = 0; + let tfe = 0; +} + class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), Index: lib/Target/R600/SIInstructions.td =================================================================== --- lib/Target/R600/SIInstructions.td +++ lib/Target/R600/SIInstructions.td @@ -29,6 +29,11 @@ def isSI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">; +def isCI : Predicate<"Subtarget.getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; + + def WAIT_FLAG : InstFlag<"printWaitFlag">; let Predicates = [isSI] in { @@ -491,6 +496,78 @@ def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; +let Predicates = [HasFlatAddressSpace] in { +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>; + +def FLAT_STORE_BYTE : FLAT_Store_Helper < + 0x00000018, "FLAT_STORE_BYTE", VReg_32 +>; + +def FLAT_STORE_SHORT : FLAT_Store_Helper < + 0x0000001a, "FLAT_STORE_SHORT", VReg_32 +>; + +def FLAT_STORE_DWORD : FLAT_Store_Helper < + 0x0000001c, "FLAT_STORE_DWORD", VReg_32 +>; + +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x0000001d, "FLAT_STORE_DWORDX2", VReg_64 +>; + +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x0000001e, "FLAT_STORE_DWORDX4", VReg_128 +>; + +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x0000001e, "FLAT_STORE_DWORDX3", VReg_96 +>; + +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>; +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>; +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>; +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>; +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>; +//def 
FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>; +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>; +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>; +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>; +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>; +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>; +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>; +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>; +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>; +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>; +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>; +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>; +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>; +//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>; +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>; +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>; +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>; +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>; +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>; +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>; +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>; +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>; +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>; +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>; +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>; +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>; +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>; +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>; +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>; + +} // End HasFlatAddressSpace predicate + + let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit @@ -2084,6 +2161,39 @@ def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class FLATLoad_Pattern : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr) +>; + +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; + +class FLATStore_Pattern : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr) + >; + +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; + + /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ Index: lib/Target/R600/SILowerControlFlow.cpp =================================================================== --- lib/Target/R600/SILowerControlFlow.cpp +++ lib/Target/R600/SILowerControlFlow.cpp @@ -51,6 +51,7 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" #include 
"SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -415,6 +416,7 @@ bool HaveKill = false; bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -500,6 +502,24 @@ NeedWQM = true; break; + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_SBYTE: + case AMDGPU::FLAT_LOAD_SSHORT: + case AMDGPU::FLAT_LOAD_UBYTE: + case AMDGPU::FLAT_LOAD_USHORT: + case AMDGPU::FLAT_STORE_BYTE: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_SHORT: + // TODO: atomics and other flat instructions + NeedFlat = true; + break; + } } } @@ -518,5 +538,39 @@ AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + unsigned StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin = static_cast(TII)->getIndirectIndexBegin(MF); + + // Convert register index to 256-byte unit. + // XXX - Does it mean bits? 256-bytes seems wrong. + unsigned StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_MOVK_I32), + AMDGPU::FLAT_SCRATCH_OFFSET).addImm(StackOffset); + + // XXX - Documentation says size is "per-thread scratch size in bytes", but + // that's crazy. Maybe it means per wave? + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_MOVK_I32), + AMDGPU::FLAT_SCRATCH_SIZE).addImm(StackSizeBytes); + + } + return true; } Index: lib/Target/R600/SIRegisterInfo.td =================================================================== --- lib/Target/R600/SIRegisterInfo.td +++ lib/Target/R600/SIRegisterInfo.td @@ -17,11 +17,20 @@ } // Special Registers + +// Pair to indicate location of scratch space for flat accesses. +// Offset is in units of 256-bytes, +def FLAT_SCRATCH_OFFSET : SIReg <"FLAT_SCRATCH_OFFSET", 104>; + +// Size is the per-thread scratch size, in bytes. +def FLAT_SCRATCH_SIZE : SIReg <"FLAT_SCRATCH_SIZE", 105>; + def VCC : SIReg<"VCC", 106>; def EXEC : SIReg<"EXEC", 126>; def SCC : SIReg<"SCC", 253>; def M0 : SIReg <"M0", 124>; + // SGPR registers foreach Index = 0-101 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; Index: test/CodeGen/R600/flat-address-space.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/flat-address-space.ll @@ -0,0 +1,182 @@ +; RUN: llc -O0 -march=r600 -mcpu=bonaire < %s | FileCheck %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. 
+
+
+; CHECK-LABEL: @branch_use_flat_i32:
+; CHECK: ; BB#3: ; %global
+
+; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+
+; CHECK: ; BB#2: ; %local
+
+; CHECK: V_MOV_B32_e32 v[[LO_VREG]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG]], {{s[0-9]+}}
+
+; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %local, label %global
+
+local:
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  br label %end
+
+global:
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  br label %end
+
+end:
+  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+;  %val = load i32 addrspace(4)* %fptr, align 4
+;  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; These testcases might become useless when there are optimizations to
+; remove generic pointers.
+
+; CHECK-LABEL: @store_flat_i32:
+; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_i64:
+; CHECK: FLAT_STORE_DWORDX2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_v4i32:
+; CHECK: FLAT_STORE_DWORDX4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i16:
+; CHECK: FLAT_STORE_SHORT
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %y = trunc i32 %x to i16
+  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i8:
+; CHECK: FLAT_STORE_BYTE
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %y = trunc i32 %x to i8
+  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+
+
+; CHECK-LABEL: @load_flat_i32:
+; CHECK: FLAT_LOAD_DWORD
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %fload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %fload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_i64:
+; CHECK: FLAT_LOAD_DWORDX2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  %fload = load i64 addrspace(4)* %fptr, align 4
+  store i64 %fload, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_v4i32:
+; CHECK: FLAT_LOAD_DWORDX4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i8:
+; CHECK: FLAT_LOAD_SBYTE
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = sext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i8:
+; CHECK: FLAT_LOAD_UBYTE
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = zext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i16:
+; CHECK: FLAT_LOAD_SSHORT
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = sext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i16:
+; CHECK: FLAT_LOAD_USHORT
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = zext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: @store_flat_scratch:
+; CHECK: S_MOVK_I32 FLAT_SCRATCH_SIZE, 40
+; CHECK: S_MOVK_I32 FLAT_SCRATCH_OFFSET,
+; CHECK: FLAT_STORE_DWORD
+; CHECK: S_BARRIER
+; CHECK: FLAT_LOAD_DWORD
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32 %x) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %pptr = getelementptr i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
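
For anyone trying the patch out, here is a minimal sketch (not part of the patch or its tests) of IR that exercises the new flat_load / flat_store fragments without an addrspacecast: the pointers are declared directly in addrspace(4), which this series maps to AMDGPUAS::FLAT_ADDRESS. The function name is made up; the expectation, on a SEA_ISLANDS part such as bonaire, is that the load and store select to FLAT_LOAD_DWORD and FLAT_STORE_DWORD.

; Illustrative only -- assumes addrspace(4) stays mapped to FLAT_ADDRESS as above.
; Run with something like: llc -march=r600 -mcpu=bonaire < flat-copy.ll
define void @copy_flat_i32(i32 addrspace(4)* noalias %out, i32 addrspace(4)* noalias %in) #0 {
  %val = load i32 addrspace(4)* %in, align 4        ; expected: FLAT_LOAD_DWORD
  store i32 %val, i32 addrspace(4)* %out, align 4   ; expected: FLAT_STORE_DWORD
  ret void
}

attributes #0 = { nounwind }

Since the new datalayout string declares p4 as a 64-bit pointer, address arithmetic on these flat pointers is done in 64 bits, matching the 64-bit addresses the FLAT instructions consume.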