Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -463,6 +463,14 @@
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
+
+def int_amdgcn_s_buffer_load : Intrinsic <
+  [llvm_anyint_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, 42>, // rsrc pointer (CONSTANT_ADDRESS_W_RSRC)
+   llvm_i32_ty,                               // byte offset
+   llvm_i1_ty],                               // glc
+  [IntrReadMem, IntrArgMemOnly, NoCapture<0>]>;
+
 class AMDGPUBufferStore : Intrinsic <
   [],
   [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -193,6 +193,7 @@
   const static unsigned CONSTANT_BUFFER_14 = 22;
   const static unsigned CONSTANT_BUFFER_15 = 23;
 
+  const static unsigned CONSTANT_ADDRESS_W_RSRC = 42;
   // Some places use this if the address space can't be determined.
   const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -389,6 +389,7 @@
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  SBUFFER_LOAD,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3532,6 +3532,7 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(SBUFFER_LOAD)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -222,7 +222,7 @@
     return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
            "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p42:128:128"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,12 @@
   }
 
   unsigned getVectorSplitCost() { return 0; }
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
 };

 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -203,6 +203,7 @@
   AMDGPUAS AS = ST->getAMDGPUAS();
   if (AddrSpace == AS.GLOBAL_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS_W_RSRC ||
       AddrSpace == AS.FLAT_ADDRESS)
     return 128;
   if (AddrSpace == AS.LOCAL_ADDRESS ||
@@ -479,3 +480,34 @@

   return false;
 }
+
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) {
+  IRBuilder<> Builder(Inst);
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+
+    // We can only set this if the intrinsic is functionally equivalent to a
+    // load/store.
+    if (auto Offset = dyn_cast<ConstantInt>(Inst->getArgOperand(1))) {
+      if (Offset->isZero() &&
+          cast<ConstantInt>(Inst->getArgOperand(2))->isZero()) {
+        Info.PtrVal = Inst->getArgOperand(0);
+      }
+    }
+    break;
+  }
+  return true;
+}
+
+Value *AMDGPUTTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                        Type *ExpectedType) {
+  if (Inst->getType() == ExpectedType)
+    return Inst;
+
+  return nullptr;
+}
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -709,8 +709,8 @@

 // Offset in an 32-bit VGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc)),
+  (BUFFER_LOAD_DWORD_OFFEN $offset, $sbase, (i32 0), 0, (as_i1imm $glc), 0, 0)
 >;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -195,6 +195,8 @@
                          EVT VT) const override;
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                             SelectionDAG &DAG) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -35,6 +35,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -544,6 +545,15 @@
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.opc = AMDGPUISD::SBUFFER_LOAD;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
   default:
     return false;
   }
@@ -628,7 +638,8 @@
     }

     return isLegalMUBUFAddressingMode(AM);
-  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+             AS == AMDGPUASI.CONSTANT_ADDRESS_W_RSRC) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -2152,6 +2163,36 @@
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//

+void SITargetLowering::LowerOperationWrapper(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+
+  if (N->getOpcode() != AMDGPUISD::SBUFFER_LOAD) {
+    TargetLowering::LowerOperationWrapper(N, Results, DAG);
+    return;
+  }
+
+  SDLoc DL(N);
+  MemSDNode *M = cast<MemSDNode>(N);
+  SDValue Ops[] = {
+    M->getOperand(0), // Chain
+    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, M->getOperand(1)), // Ptr
+    M->getOperand(2), // Offset
+    DAG.getTargetConstant(cast<ConstantSDNode>(
+      M->getOperand(3))->getZExtValue(), DL, MVT::i1) // glc
+  };
+
+  auto MMO = M->getMemOperand();
+  if (isDereferenceablePointer(MMO->getValue(), DAG.getDataLayout()))
+    MMO->setFlags(MachineMemOperand::MODereferenceable);
+
+  SDValue LD = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                       M->getVTList(), Ops, M->getMemoryVT(),
+                                       M->getMemOperand());
+  Results.push_back(LD);
+  Results.push_back(LD.getValue(1));
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -2970,8 +3011,10 @@
            TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
-      Op.getOperand(1),
-      Op.getOperand(2)
+      DAG.getEntryNode(),                   // Chain
+      Op.getOperand(1),                     // Ptr
+      Op.getOperand(2),                     // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
     };

     MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -2979,8 +3022,16 @@
       MachineMemOperand::MOLoad |
       MachineMemOperand::MODereferenceable |
       MachineMemOperand::MOInvariant,
       VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
+    SDValue MergeOps[] = {
+      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load),
+      Load.getValue(1)
+    };
+
+    return DAG.getMergeValues(MergeOps, DL);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -26,11 +26,6 @@
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//

-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-  [SDNPMayLoad, SDNPMemOperand]
->;
-
 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
@@ -92,6 +87,11 @@
 defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
 defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;

+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+  SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]
+>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -276,15 +276,16 @@
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

 // 1. Offset as an immediate
-def SM_LOAD_PATTERN : Pat <  // name this pattern to reuse AddedComplexity on CI
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+// name this pattern to reuse AddedComplexity on CI
+def SM_LOAD_PATTERN : Pat <
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, (as_i1imm $glc))
 >;

 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgpr i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, (as_i1imm $glc))
 >;

 } // End let AddedComplexity = 100
@@ -520,8 +521,8 @@
 def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;

 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, (as_i1imm $glc))> {
   let Predicates = [isCI]; // should this be isCIOnly?
 }
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -84,6 +84,23 @@
   ret void
 }

+; Use llvm.SI.load.const and llvm.amdgcn.s.buffer.load with a VGPR offset
+; CHECK-LABEL: {{^}}s_buffer_load:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_load(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 %tid)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 %tid, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %22, float %22, float %22, float %s.buffer.float, i1 1, i1 0)
+  ret void
+}
+
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
@@ -174,7 +191,12 @@
   ret void
 }

+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)

 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
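
For reference, a minimal usage sketch (not part of the patch; the function name and offsets are illustrative) of why the getTgtMemIntrinsic / getOrCreateResultFromMemIntrinsic hooks matter: two llvm.amdgcn.s.buffer.load calls on the same descriptor with a zero offset and glc = false describe the same memory, so EarlyCSE can reuse the first result for the second.

; Sketch only -- assumes the intrinsic and address space 42 introduced by this patch.
define amdgpu_ps float @sbuffer_cse_example(i32 addrspace(42)* inreg %rsrc) {
  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 0, i1 false)
  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 0, i1 false)
  ; With the TTI hooks above (zero offset, glc = false), %b is recognized as a
  ; reload of %a and can be eliminated by EarlyCSE.
  %sum = add i32 %a, %b
  %res = bitcast i32 %sum to float
  ret float %res
}

declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)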