Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -326,6 +326,8 @@
   ATOMIC_INC,
   ATOMIC_DEC,
   SBUFFER_LOAD,
+  BUFFER_LOAD,
+  BUFFER_LOAD_FORMAT,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3036,6 +3036,8 @@
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(SBUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
 }
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -759,12 +759,12 @@
   >;
 }
 
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
 
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2693,6 +2693,7 @@
 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  SDLoc DL(Op);
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -2708,6 +2709,31 @@
     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
+  case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_format: {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Op.getOperand(4), // offset
+      Op.getOperand(5), // glc
+      Op.getOperand(6), // slc
+    };
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(MFI->getBufferPSV()),
+      MachineMemOperand::MOLoad,
+      VT.getStoreSize(), VT.getStoreSize());
+
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+  }
   default:
     return SDValue();
   }
 }
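The reason the lowering above goes through DAG.getMemIntrinsicNode with an explicit MachineMemOperand is that the resulting BUFFER_LOAD node is a MemSDNode, so target-independent code can see what it reads instead of treating it as an unknown side effect. Below is a minimal sketch of the kind of query this enables, using the generic MemSDNode/MachineMemOperand APIs; the helper name is hypothetical and not part of this patch.

#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Hypothetical helper (illustration only): a node produced with
// getMemIntrinsicNode carries a MachineMemOperand, so generic code can ask
// whether it is a load backed by a target pseudo source value.
static bool isPSVBackedLoad(const SDNode *N) {
  const auto *M = dyn_cast<MemSDNode>(N);
  if (!M || !M->readMem())
    return false;
  const MachineMemOperand *MMO = M->getMemOperand();
  return MMO && MMO->getPseudoValue() != nullptr;
}

Without the memory operand, such intrinsic nodes have to be treated as maximally conservative memory operations.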
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -53,6 +53,19 @@
   [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
+def SDTBufferLoad : SDTypeProfile<1, 5,
+    [                     // vdata
+     SDTCisVT<1, v4i32>,  // rsrc
+     SDTCisVT<2, i32>,    // vindex
+     SDTCisVT<3, i32>,    // offset
+     SDTCisVT<4, i1>,     // glc
+     SDTCisVT<5, i1>]>;   // slc
+
+def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+
 def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
   SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
                        SDTCisVT<3, i32>]>
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -47,6 +47,29 @@
   }
 };
 
+class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+public:
+  explicit AMDGPUBufferPseudoSourceValue() :
+    PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+  bool isConstant(const MachineFrameInfo *) const override {
+    // This should probably be true for most buffers, but we will start by
+    // being conservative.
+    return false;
+  }
+
+  bool isAliased(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change buffer intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+
+  bool mayAlias(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change buffer intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+};
 
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
@@ -98,6 +121,7 @@
   // Stack object indices for work item IDs.
   std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
 
+  std::unique_ptr<AMDGPUBufferPseudoSourceValue> BufferPSV;
   std::unique_ptr<AMDGPUImagePseudoSourceValue> ImagePSV;
 
 public:
@@ -462,6 +486,10 @@
     llvm_unreachable("unexpected dimension");
   }
 
+  AMDGPUBufferPseudoSourceValue *getBufferPSV() {
+    return BufferPSV.get();
+  }
+
   AMDGPUImagePseudoSourceValue *getImagePSV() {
     return ImagePSV.get();
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -52,6 +52,7 @@
     WavesPerEU(0, 0),
     DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
     DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
+    BufferPSV(llvm::make_unique<AMDGPUBufferPseudoSourceValue>()),
    ImagePSV(llvm::make_unique<AMDGPUImagePseudoSourceValue>()),
     LDSWaveSpillSize(0),
     PSInputEna(0),
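The conservative isAliased()/mayAlias() overrides above are what machine-level alias queries consult: a memory operand backed by this pseudo source value can be proven not to overlap stack or LDS objects, which is what allows the two LDS stores in the test below to be merged into a ds_write2 across the buffer load. A rough consumer-side sketch under that assumption follows; the function is illustrative and not code from this patch.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"

using namespace llvm;

// Illustration only: if the access is backed by a PseudoSourceValue that
// reports !isAliased() and !mayAlias(), it cannot overlap frame objects or
// other IR values, so neighbouring LDS stores may be reordered around it.
static bool cannotAliasOtherObjects(const MachineMemOperand &MMO,
                                    const MachineFrameInfo &MFI) {
  if (const PseudoSourceValue *PSV = MMO.getPseudoValue())
    return !PSV->isAliased(&MFI) && !PSV->mayAlias(&MFI);
  return false; // No pseudo value: assume it may alias.
}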
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -112,6 +112,20 @@
   ret <4 x float> %data
 }
 
+; SI won't merge ds memory operations, because of the signed offset bug, so
+; we only have check lines for VI.
+; VI-LABEL: buffer_load_mmo
+; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+entry:
+  store float 0.0, float addrspace(3)* %lds
+  %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
+  store float 0.0, float addrspace(3)* %tmp2
+  ret float %val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0