Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -416,6 +416,14 @@
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
+
+def int_amdgcn_s_buffer_load : Intrinsic <
+  [llvm_anyint_ty],
+  [LLVMQualPointerType<llvm_i32_ty, 42>,
+   llvm_i32_ty, // byte offset
+   llvm_i1_ty], // glc
+  [IntrReadMem, IntrArgMemOnly, NoCapture<0>]>;
+
 class AMDGPUBufferStore : Intrinsic <
   [],
   [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
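For reference, a minimal IR-level use of the new intrinsic; the value names here are illustrative, but the signature matches the declarations exercised by the tests below:

  ; %rsrc is a buffer resource pointer in the new address space 42
  ; (CONSTANT_ADDRESS_W_RSRC); the i32 is a byte offset, the i1 is the glc bit.
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 16, i1 false)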
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -169,6 +169,7 @@
   CONSTANT_BUFFER_14 = 22,
   CONSTANT_BUFFER_15 = 23,
+  CONSTANT_ADDRESS_W_RSRC = 42,
 
   // Some places use this if the address space can't be determined.
   UNKNOWN_ADDRESS_SPACE = ~0u
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -340,6 +340,7 @@
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  SBUFFER_LOAD,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3305,6 +3305,7 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(SBUFFER_LOAD)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -164,7 +164,7 @@
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p42:128:128"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -102,6 +102,12 @@
   }
 
   unsigned getVectorSplitCost() { return 0; }
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
 };

 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -84,6 +84,7 @@
   switch (AddrSpace) {
   case AMDGPUAS::GLOBAL_ADDRESS:
   case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_W_RSRC:
   case AMDGPUAS::FLAT_ADDRESS:
     return 128;
   case AMDGPUAS::LOCAL_ADDRESS:
@@ -341,3 +342,36 @@
 
   return false;
 }
+
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.IsSimple = true;
+    Info.NumMemRefs = 1;
+
+    // We can only set this if the intrinsic is functionally equivalent to a
+    // load/store.
+    if (auto *Offset = dyn_cast<ConstantInt>(Inst->getArgOperand(1))) {
+      if (Offset->isZero() &&
+          cast<ConstantInt>(Inst->getArgOperand(2))->isZero()) {
+        Info.PtrVal = Inst->getArgOperand(0);
+      }
+    }
+    break;
+  }
+  return true;
+}
+
+Value *AMDGPUTTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                        Type *ExpectedType) {
+  if (Inst->getType() == ExpectedType)
+    return Inst;
+
+  return nullptr;
+}
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -713,8 +713,8 @@
 
 // Offset in a 32-bit VGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc)),
+  (BUFFER_LOAD_DWORD_OFFEN $offset, $sbase, (i32 0), 0, (as_i1imm $glc), 0, 0)
 >;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -176,6 +176,8 @@
                          EVT VT) const override;
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                             SelectionDAG &DAG) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -34,6 +34,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -431,6 +432,15 @@
     Info.readMem = true;
     Info.writeMem = true;
     return true;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.opc = AMDGPUISD::SBUFFER_LOAD;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
   default:
     return false;
   }
@@ -508,6 +518,7 @@
     return isLegalMUBUFAddressingMode(AM);
 
+  case AMDGPUAS::CONSTANT_ADDRESS_W_RSRC:
   case AMDGPUAS::CONSTANT_ADDRESS:
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
@@ -1939,6 +1950,37 @@
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
 
+void SITargetLowering::LowerOperationWrapper(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+  if (N->getOpcode() != AMDGPUISD::SBUFFER_LOAD) {
+    TargetLowering::LowerOperationWrapper(N, Results, DAG);
+    return;
+  }
+
+  SDLoc DL(N);
+  MemSDNode *M = cast<MemSDNode>(N);
+  SDValue Ops[] = {
+    M->getOperand(0),                                            // Chain
+    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, M->getOperand(1)), // Ptr
+    M->getOperand(2),                                            // Offset
+    DAG.getTargetConstant(
+        cast<ConstantSDNode>(M->getOperand(3))->getZExtValue(), DL,
+        MVT::i1)                                                 // glc
+  };
+
+  auto *MMO = M->getMemOperand();
+  if (isDereferenceablePointer(MMO->getValue(), DAG.getDataLayout()))
+    MMO->setFlags(MachineMemOperand::MODereferenceable);
+
+  SDValue LD = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                        M->getVTList(), Ops, M->getMemoryVT(),
+                                       M->getMemOperand());
+  Results.push_back(LD);
+  Results.push_back(LD.getValue(1));
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -2643,8 +2685,10 @@
       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
-      Op.getOperand(1),
-      Op.getOperand(2)
+      DAG.getEntryNode(),                   // Chain
+      Op.getOperand(1),                     // Ptr
+      Op.getOperand(2),                     // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
     };
 
     MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -2652,8 +2696,16 @@
         MachineMemOperand::MOLoad |
         MachineMemOperand::MODereferenceable |
         MachineMemOperand::MOInvariant,
         VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
+    SDValue MergeOps[] = {
+      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load),
+      Load.getValue(1)
+    };
+
+    return DAG.getMergeValues(MergeOps, DL);
   }
   case AMDGPUIntrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -26,11 +26,6 @@
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//
 
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-  [SDNPMayLoad, SDNPMemOperand]
->;
-
 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
@@ -97,6 +92,11 @@
 defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
 defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
 
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+  SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]
+>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -276,15 +276,16 @@
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
 
 // 1. Offset as an immediate
-def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+// Name this pattern to reuse AddedComplexity on CI.
+def SM_LOAD_PATTERN : Pat <
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, (as_i1imm $glc))
 >;
 
 // 2. Offset loaded in a 32-bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgpr i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, (as_i1imm $glc))
 >;
 
 } // End let AddedComplexity = 100
@@ -520,8 +521,8 @@
 def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
 
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, (as_i1imm $glc))> {
   let Predicates = [isCI]; // should this be isCIOnly?
}
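Taken together with the BUFInstructions.td pattern earlier, selection of SIsbuffer_load splits by the kind of offset operand, which the smrd.ll and mubuf.ll checks below verify:

  ; immediate offset  -> S_BUFFER_LOAD_DWORD_IMM (the _IMM_ci form on CI)
  ; offset in an SGPR -> S_BUFFER_LOAD_DWORD_SGPR
  ; offset in a VGPR  -> BUFFER_LOAD_DWORD_OFFEN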
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -84,6 +84,23 @@
   ret void
 }
 
+; Using the load.const and s.buffer.load intrinsics with a VGPR offset
+; CHECK-LABEL: {{^}}s_buffer_load:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_load(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 %tid)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 %tid, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
+  ret void
+}
+
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
@@ -174,7 +191,12 @@
   ret void
 }
 
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -87,13 +87,18 @@
 ; SMRD load using the load.const intrinsic with an immediate offset
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 16, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -101,13 +106,18 @@
 ; offset.
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1020, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
 ; SMRD load using the load.const intrinsic with an offset greater than the
@@ -116,14 +126,20 @@
 ; GCN-LABEL: {{^}}smrd_load_const2:
 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1024, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -131,14 +147,20 @@
 ; GCN-LABEL: {{^}}smrd_load_const3:
 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1048572, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -146,14 +168,19 @@
 ; GCN-LABEL: {{^}}smrd_load_const4:
 ; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
 ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1048576, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -162,4 +189,6 @@
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+
 attributes #0 = { nounwind readnone }
Index: test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -S -mtriple=amdgcn-- -early-cse | FileCheck %s
+
+; CHECK-LABEL: @no_cse
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+define void @no_cse(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @cse_zero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_zero_offset(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @cse_nonzero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_nonzero_offset(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
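Taken together, these tests pin down the EarlyCSE behavior enabled by the new TTI hooks: identical readonly calls are merged, while calls that differ in offset are kept apart. The zero-offset case is the one where getTgtMemIntrinsic additionally records Info.PtrVal, since such a call reads exactly the dword the pointer refers to; a sketch with illustrative names:

  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %p, i32 0, i1 false)
  ; ...reads the same memory as:
  %b = load i32, i32 addrspace(42)* %p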