Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -802,6 +802,14 @@ def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; +def int_amdgcn_s_buffer_load : Intrinsic < + [llvm_anyint_ty], + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // byte offset(SGPR/VGPR/imm) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc) + [IntrNoMem]>, + AMDGPURsrcIntrinsic<0>; + class AMDGPUBufferStore : Intrinsic < [], [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32 Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -486,6 +486,7 @@ BUFFER_LOAD, BUFFER_LOAD_FORMAT, BUFFER_LOAD_FORMAT_D16, + SBUFFER_LOAD, BUFFER_STORE, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4170,6 +4170,7 @@ NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) + NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4921,8 +4921,9 @@ MFI->getArgInfo().WorkItemIDZ); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { - Op.getOperand(1), - Op.getOperand(2) + Op.getOperand(1), // Ptr + 
Op.getOperand(2), // Offset + DAG.getTargetConstant(0, DL, MVT::i1) // glc }; MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -4930,7 +4931,26 @@ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + SDVTList VTList = DAG.getVTList(MVT::i32); + SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, + VTList, Ops, MVT::i32, MMO); + + return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); + } + case Intrinsic::amdgcn_s_buffer_load: { + unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + SDValue Ops[] = { + Op.getOperand(1), // Ptr + Op.getOperand(2), // Offset + DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, Op->getVTList(), Ops, VT, MMO); } case Intrinsic::amdgcn_fdiv_fast: Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -101,6 +101,8 @@ MachineInstr &Inst) const; void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; + void splitScalarBuffer(SetVectorType &Worklist, + MachineInstr &Inst) const; void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3904,8 +3904,34 @@ Inst.eraseFromParent(); continue; - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: { - unsigned VDst = 
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: { + unsigned VDst; + unsigned NewOpcode; + + switch(Opcode) { + case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: + NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN; + VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: + NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: + NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; + VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: + splitScalarBuffer(Worklist, Inst); + Inst.eraseFromParent(); + continue; + } + const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff); auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); unsigned Offset = 0; @@ -3956,7 +3982,7 @@ MachineInstr *NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), - get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) + get(NewOpcode), VDst) .add(*VAddr) // vaddr .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc .addImm(0) // soffset @@ -4457,6 +4483,73 @@ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + auto &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst); + MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase); + MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff); + MachineOperand &Glc = 
*getNamedOperand(Inst, AMDGPU::OpName::glc); + + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; + unsigned Count = 0; + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + + switch(Opcode) { + default: + return; + case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: + Count = 2; + break; + case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: + Count = 4; + break; + } + + // FIXME: Should also attempt to build VAddr and Offset like the non-split + // case (see call site for this function) + + // Create a vector of result registers + SmallVector<unsigned, 16> ResultRegs; + for (unsigned i = 0; i < Count ; ++i) { + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass); + MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg) + .addReg(Offset.getReg()) // offset + .addReg(Rsrc.getReg()) // rsrc + .addImm(0) // soffset + .addImm(i << 4) // inst_offset + .addImm(Glc.getImm()) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addMemOperand(*Inst.memoperands_begin()); + // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE + auto &NewDestOp = NewMI.getOperand(0); + for (unsigned i = 0 ; i < 4 ; i++) + ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass, + RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass)); + } + // Create a new combined result to replace original with + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL, + get(TargetOpcode::REG_SEQUENCE), FullDestReg); + + for (unsigned i = 0 ; i < Count * 4 ; ++i) { + CombinedResBuilder + .addReg(ResultRegs[i]) + .addImm(RI.getSubRegFromChannel(i)); + } + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, 
MachineRegisterInfo &MRI, Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -40,9 +40,9 @@ def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; -def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, - [SDNPMayLoad, SDNPMemOperand] +def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", + SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>, + [SDNPMayLoad, SDNPMemOperand] >; def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, Index: llvm/trunk/lib/Target/AMDGPU/SMInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SMInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SMInstructions.td @@ -409,6 +409,22 @@ >; } +multiclass SMLoad_Pattern <string Instr, ValueType vt> { + // 1. Offset as an immediate + // name this pattern to reuse AddedComplexity on CI + def _IMM : GCNPat < + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc))) + >; +} + + let OtherPredicates = [isSICI] in { def : GCNPat < (i64 (readcyclecounter)), @@ -427,18 +443,12 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; -// 1. Offset as an immediate -def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI - (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0) ->; - -// 2. 
Offset loaded in an 32bit SGPR -def : GCNPat < - (SIload_constant v4i32:$sbase, i32:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0) ->; - +// Name the pattern to reuse AddedComplexity on CI +defm SM_LOAD_PATTERN : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; } // End let AddedComplexity = 100 let OtherPredicates = [isVI] in { @@ -757,7 +767,7 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; -let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { +let AddedComplexity = SM_LOAD_PATTERN_IMM.AddedComplexity in { class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), @@ -771,11 +781,17 @@ def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>; def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>; -def : GCNPat < - (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> { +class SMLoad_Pattern_ci <string Instr, ValueType vt> : GCNPat < + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)), + (!cast<InstSI>(Instr) $sbase, $offset, (as_i1imm $glc))> { let OtherPredicates = [isCI]; // should this be isCIOnly? 
} +def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORD_IMM_ci", i32>; +def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX2_IMM_ci", v2i32>; +def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX4_IMM_ci", v4i32>; +def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX8_IMM_ci", v8i32>; +def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX16_IMM_ci", v16i32>; + } // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity Index: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll +++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll @@ -106,7 +106,7 @@ ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp @@ -119,13 +119,18 @@ ; offset. 
; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc glc +define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void } @@ -135,14 +140,20 @@ ; GCN-LABEL: {{^}}smrd_load_const2: ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 ; SI: 
s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void } 
@@ -150,14 +161,20 @@ ; GCN-LABEL: {{^}}smrd_load_const3: ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc +define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, 
float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 ret void } @@ -165,14 +182,95 @@ ; GCN-LABEL: {{^}}smrd_load_const4: ; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000 ; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] +; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 
true, i1 true) #0 + ret void +} + +; dwordx2 s.buffer.load +; GCN-LABEL: {{^}}s_buffer_load_dwordx2: +; VIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80 +; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 +define amdgpu_ps void @s_buffer_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { +main_body: + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0) + %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0 + %s.buffer.0.float = bitcast i32 %s.buffer.0 to float + %s.buffer.1 = extractelement <2 x i32> %s.buffer, i32 1 + %s.buffer.1.float = bitcast i32 %s.buffer.1 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.0.float, float %s.buffer.1.float, i1 true, i1 true) #0 + ret void +} + +; dwordx4 s.buffer.load +; GCN-LABEL: {{^}}s_buffer_load_dwordx4: +; VIGFX9: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80 +; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 +define amdgpu_ps void @s_buffer_load_dwordx4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { +main_body: + %tmp22 = 
load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0) + %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0 + %s.buffer.0.float = bitcast i32 %s.buffer.0 to float + %s.buffer.1 = extractelement <4 x i32> %s.buffer, i32 1 + %s.buffer.1.float = bitcast i32 %s.buffer.1 to float + %s.buffer.2 = extractelement <4 x i32> %s.buffer, i32 2 + %s.buffer.2.float = bitcast i32 %s.buffer.2 to float + %s.buffer.3 = extractelement <4 x i32> %s.buffer, i32 3 + %s.buffer.3.float = bitcast i32 %s.buffer.3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0 + ret void +} + +; dwordx8 s.buffer.load +; GCN-LABEL: {{^}}s_buffer_load_dwordx8: +; VIGFX9: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80 +; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 +define amdgpu_ps void @s_buffer_load_dwordx8(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { +main_body: + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0) + %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0 + %s.buffer.0.float = bitcast i32 %s.buffer.0 to float + %s.buffer.1 = extractelement <8 x i32> %s.buffer, i32 2 + %s.buffer.1.float = bitcast i32 %s.buffer.1 to float + %s.buffer.2 = extractelement <8 x i32> %s.buffer, i32 5 + %s.buffer.2.float = bitcast i32 %s.buffer.2 to float + %s.buffer.3 = extractelement <8 x 
i32> %s.buffer, i32 7 + %s.buffer.3.float = bitcast i32 %s.buffer.3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0 + ret void +} + +; dwordx16 s.buffer.load +; GCN-LABEL: {{^}}s_buffer_load_dwordx16: +; VIGFX9: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80 +; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20 +define amdgpu_ps void @s_buffer_load_dwordx16(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 { +main_body: + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0) + %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0 + %s.buffer.0.float = bitcast i32 %s.buffer.0 to float + %s.buffer.1 = extractelement <16 x i32> %s.buffer, i32 3 + %s.buffer.1.float = bitcast i32 %s.buffer.1 to float + %s.buffer.2 = extractelement <16 x i32> %s.buffer, i32 12 + %s.buffer.2.float = bitcast i32 %s.buffer.2 to float + %s.buffer.3 = extractelement <16 x i32> %s.buffer, i32 15 + %s.buffer.3.float = bitcast i32 %s.buffer.3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %s.buffer.0.float, float %s.buffer.1.float, float %s.buffer.2.float, float %s.buffer.3.float, i1 true, i1 true) #0 ret void } @@ -339,10 +437,90 @@ br i1 %outer_br, label %.outer_loop_header, label %ret_block } +; SMRD load with a non-const offset +; GCN-LABEL: {{^}}smrd_load_nonconst0: +; SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; 
SIVIGFX9: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; GCN: s_endpgm +define amdgpu_ps void @smrd_load_nonconst0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 { +main_body: + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 + ret void +} + +; SMRD load with a non-const non-uniform offset +; GCN-LABEL: {{^}}smrd_load_nonconst1: +; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; GCN: s_endpgm +define amdgpu_ps void @smrd_load_nonconst1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x 
i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 { +main_body: + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %s.buffer.float = bitcast i32 %s.buffer to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 + ret void +} + +; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting) +; GCN-LABEL: {{^}}smrd_load_nonconst2: +; SIVIGFX9: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; SIVIGFX9: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; GCN: s_endpgm +define amdgpu_ps void @smrd_load_nonconst2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 { +main_body: + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + 
%s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1 + %s.buffer.float = bitcast i32 %s.buffer.elt to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0 + ret void +} + +; SMRD load dwordx2 +; GCN-LABEL: {{^}}smrd_load_dwordx2: +; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; GCN: s_endpgm +define amdgpu_ps void @smrd_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 { +main_body: + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in + %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) + %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float> + %r.1 = extractelement <2 x float> %s.buffer.float, i32 0 + %r.2 = extractelement <2 x float> %s.buffer.float, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r.1, float %r.1, float %r.1, float %r.2, i1 true, i1 true) #0 + ret void +} + + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) +declare <4 x i32> 
@llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll =================================================================== --- llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll +++ llvm/trunk/test/Transforms/EarlyCSE/intrinsics.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -S -mtriple=amdgcn-- -early-cse | FileCheck %s + +; CHECK-LABEL: @no_cse +; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) +; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) +define void @no_cse(i32 addrspace(1)* %out, <4 x i32> %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) + %c = add i32 %a, %b + store i32 %c, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @cse_zero_offset +; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) +; CHECK: add i32 [[CSE]], [[CSE]] +define void @cse_zero_offset(i32 addrspace(1)* %out, <4 x i32> %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 0, i32 0) + %c = add i32 %a, %b + store i32 %c, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @cse_nonzero_offset +; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) +; CHECK: add i32 [[CSE]], [[CSE]] +define void @cse_nonzero_offset(i32 addrspace(1)* %out, <4 x i32> %in) { + %a = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) + %b = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %in, i32 4, i32 0) + %c = add i32 %a, %b + store i32 %c, i32 addrspace(1)* %out + ret 
void +} + +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> nocapture, i32, i32)