Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -819,7 +819,7 @@
 defset list<Intrinsic> AMDGPUBufferIntrinsics = {
 
 class AMDGPUBufferLoad : Intrinsic <
-  [llvm_anyfloat_ty],
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
@@ -840,7 +840,7 @@
 
 class AMDGPUBufferStore : Intrinsic <
   [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
+  [llvm_any_ty,       // vdata(VGPR)
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -483,10 +483,16 @@
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
+  BUFFER_LOAD_UBYTE,
+  BUFFER_LOAD_USHORT,
+  BUFFER_LOAD_BYTE,
+  BUFFER_LOAD_SHORT,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
   SBUFFER_LOAD,
   BUFFER_STORE,
+  BUFFER_STORE_BYTE,
+  BUFFER_STORE_SHORT,
   BUFFER_STORE_FORMAT,
   BUFFER_STORE_FORMAT_D16,
   BUFFER_ATOMIC_SWAP,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4200,10 +4200,16 @@
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
+  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(SBUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_STORE)
+  NODE_NAME_CASE(BUFFER_STORE_BYTE)
+  NODE_NAME_CASE(BUFFER_STORE_SHORT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
@@ -4368,6 +4374,14 @@
     }
     break;
   }
+  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
+    Known.Zero.setHighBits(24);
+    break;
+  }
+  case AMDGPUISD::BUFFER_LOAD_USHORT: {
+    Known.Zero.setHighBits(16);
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
@@ -4413,6 +4427,14 @@
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW:
     return 31;
+  case AMDGPUISD::BUFFER_LOAD_BYTE:
+    return 25;
+  case AMDGPUISD::BUFFER_LOAD_SHORT:
+    return 17;
+  case AMDGPUISD::BUFFER_LOAD_UBYTE:
+    return 24;
+  case AMDGPUISD::BUFFER_LOAD_USHORT:
+    return 16;
   case AMDGPUISD::FP_TO_FP16:
   case AMDGPUISD::FP16_ZEXT:
     return 16;
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1132,6 +1132,10 @@
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
 
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
@@ -1196,6 +1200,8 @@
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
 
 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
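With the result/vdata types widened from llvm_anyfloat_ty to llvm_any_ty and the MUBUF selection patterns above in place, the buffer intrinsics gain i8 and i16 overloads through the usual suffix mangling. A minimal IR sketch of the new overloads (the declarations match the ones added to the tests below; the function name is illustrative):

    declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1)
    declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1)

    define amdgpu_ps void @widen_byte_to_short(<4 x i32> inreg %rsrc) {
      %b = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
      %w = zext i8 %b to i16
      call void @llvm.amdgcn.buffer.store.i16(i16 %w, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
      ret void
    }

This should select to a buffer_load_ubyte followed by a buffer_store_short with no explicit masking, since the high bits of the loaded value are known zero (see the computeKnownBits changes below).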
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -140,6 +140,7 @@
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                                  const APFloat &C) const;
@@ -192,6 +193,15 @@
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
                         SDValue *Offsets, unsigned Align = 4) const;
 
+  // Handle 8 bit and 16 bit buffer loads
+  SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
+                                     ArrayRef<SDValue> Ops, MemSDNode *M) const;
+
+  // Handle 8 bit and 16 bit buffer stores
+  SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
+                                      SDLoc DL, SDValue Ops[],
+                                      MemSDNode *M) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,11 +216,15 @@
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -677,6 +681,7 @@
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
@@ -5592,6 +5597,12 @@
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5620,6 +5631,12 @@
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -5648,6 +5665,12 @@
     if (LoadVT.getScalarType() == MVT::f16)
       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
                                  M, DAG, Ops);
+
+    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+    if (LoadVT.getScalarType() == MVT::i8 ||
+        LoadVT.getScalarType() == MVT::i16)
+      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
@@ -6218,6 +6241,12 @@
                       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6244,6 +6273,12 @@
                       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6270,6 +6305,12 @@
                       AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+
+    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+    EVT VDataType = VData.getValueType().getScalarType();
+    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
@@ -6372,6 +6413,40 @@
   Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
 }
 
+// Handle 8 bit and 16 bit buffer loads
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+                                                     EVT LoadVT, SDLoc DL,
+                                                     ArrayRef<SDValue> Ops,
+                                                     MemSDNode *M) const
+{
+  EVT IntVT = LoadVT.changeTypeToInteger();
+  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
+         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
+
+  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
+                                               Ops, IntVT,
+                                               M->getMemOperand());
+  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
+                                        LoadVT.getScalarType(), BufferLoad);
+  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+}
+
+// Handle 8 bit and 16 bit buffer stores
+SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
+                                                      EVT VDataType, SDLoc DL,
+                                                      SDValue Ops[],
+                                                      MemSDNode *M) const
+{
+  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
+  Ops[1] = BufferStoreExt;
+  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
+                                          AMDGPUISD::BUFFER_STORE_SHORT;
+  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
+  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
+                                 M->getMemOperand());
+}
+
 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                  ISD::LoadExtType ExtType, SDValue Op,
                                  const SDLoc &SL, EVT VT) {
@@ -7703,6 +7778,44 @@
   return SDValue();
 }
 
+SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+                                                        DAGCombinerInfo &DCI)
+                                                        const {
+
+  SDValue Src = N->getOperand(0);
+  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
+
+  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+        VTSign->getVT() == MVT::i8) ||
+       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+        VTSign->getVT() == MVT::i16)) &&
+      Src.hasOneUse()) {
+    auto *M = cast<MemSDNode>(Src);
+    SDValue Ops[] = {
+      Src.getOperand(0), // Chain
+      Src.getOperand(1), // rsrc
+      Src.getOperand(2), // vindex
+      Src.getOperand(3), // voffset
+      Src.getOperand(4), // soffset
+      Src.getOperand(5), // offset
+      Src.getOperand(6), // cachepolicy
+      Src.getOperand(7)  // idxen
+    };
+    // replace with BUFFER_LOAD_BYTE/SHORT
+    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
+                                         Src.getOperand(0).getValueType());
+    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
+           AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
+    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
+                                                            ResList,
+                                                            Ops, M->getMemoryVT(),
+                                                            M->getMemOperand());
+    return DCI.DAG.getMergeValues({BufferLoadSignExt,
+                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
+  }
+  return SDValue();
+}
+
 SDValue SITargetLowering::performClassCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -8951,7 +9064,6 @@
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     return SDValue();
-
   switch (N->getOpcode()) {
   default:
     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -9018,6 +9130,8 @@
     return performXorCombine(N, DCI);
   case ISD::ZERO_EXTEND:
     return performZeroExtendCombine(N, DCI);
+  case ISD::SIGN_EXTEND_INREG:
+    return performSignExtendInRegCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
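The sign_extend_inreg combine is what makes the signed forms select directly. An i8 load that is sign-extended in IR reaches the DAG as sign_extend_inreg(BUFFER_LOAD_UBYTE, i8); when the inreg width matches the memory width and the load has a single use, the combine rewrites it to BUFFER_LOAD_BYTE (likewise USHORT to SHORT). The sign-bit counts registered above follow from this: a byte load sign-extends bit 7 across bits 8-31, so 32 - 8 + 1 = 25 bits are known copies of the sign bit (17 for a short load), while the unsigned forms have 24 and 16 known-zero high bits. A sketch of the payoff, mirroring the buffer_load_sbyte test below (function name illustrative):

    define amdgpu_ps float @sbyte_selects_directly(<4 x i32> inreg %rsrc) {
      %b = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
      ; the sext becomes sign_extend_inreg of the 32-bit load result, which
      ; the combine folds into a single buffer_load_sbyte (no v_bfe_i32)
      %s = sext i8 %b to i32
      %f = sitofp i32 %s to float
      ret float %f
    }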
<"AMDGPUISD::BUFFER_STORE_SHORT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -257,9 +257,194 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_ubyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %val = uitofp i8 %tmp to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i8 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i8 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check: +;CHECK-NEXT: %bb. 
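Two details of the load tests above are worth noting. The *_bitcast tests check that a zero-extend of the loaded value folds away entirely, which falls out of the known-bits results registered earlier. The buffer_load_sbyte_type_check test guards the combine's width check: DAG combining canonicalizes a shift pair like the one below into sign_extend_inreg with VT i5, which must not be folded into the i8 load, so the expected output keeps buffer_load_ubyte plus a v_bfe_i32. A standalone sketch of that canonicalization (function name illustrative):

    define i32 @sext_from_bit4(i32 %x) {
      ; shl+ashr by 27 sign-extends from bit 4, i.e. sign_extend_inreg %x, i5
      %a = shl i32 %x, 27
      %b = ashr i32 %a, 27
      ret i32 %b
    }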
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -233,9 +233,35 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_byte:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_short:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
+define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
 
 attributes #0 = { nounwind }
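The raw and struct intrinsic families tested below pick up the same i8/i16 overloads; only the offset/index signatures differ. A combined sketch (declarations as added to the raw/struct tests; the function name is illustrative):

    declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32)
    declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32)

    define amdgpu_ps float @raw_plus_struct(<4 x i32> inreg %rsrc, i32 %idx) {
      %b = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
      %s = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
      %b32 = zext i8 %b to i32
      %s32 = zext i16 %s to i32
      %sum = add i32 %b32, %s32
      %f = uitofp i32 %sum to float
      ret float %f
    }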
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -263,6 +263,62 @@
   ret {<4 x float>, <2 x float>, float} %r2
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
@@ -270,5 +326,7 @@
 declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -189,6 +189,32 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
@@ -196,6 +222,8 @@
 declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
+declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -144,6 +144,62 @@
   ret {<4 x float>, <2 x float>, float} %r2
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i8 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = zext i16 %tmp to i32
+  %val = uitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sbyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i8 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_sshort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen
+;CHECK-NEXT: s_waitcnt vmcnt(0)
+;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+;CHECK-NEXT: ; return to shader part epilog
+define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
+  %tmp2 = sext i16 %tmp to i32
+  %val = sitofp i32 %tmp2 to float
+  ret float %val
+}
+
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
@@ -151,5 +207,7 @@
 declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
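On the store side, handleByteShortBufferStores any-extends the i8/i16 vdata to i32 and emits BUFFER_STORE_BYTE/SHORT, which write only the low bits of the register, so no explicit masking is needed before the call. A sketch against the struct variant tested below (function name illustrative):

    declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32)

    define amdgpu_ps void @store_low_byte(<4 x i32> inreg %rsrc, i32 %v, i32 %idx) {
      ; only the low 8 bits of %v reach memory
      %b = trunc i32 %v to i8
      call void @llvm.amdgcn.struct.buffer.store.i8(i8 %b, <4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
      ret void
    }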
Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -108,6 +108,32 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_store_byte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i8
+  call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_store_short:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
+;CHECK-NEXT: s_endpgm
+define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+main_body:
+  %v2 = fptoui float %v1 to i32
+  %v3 = trunc i32 %v2 to i16
+  call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
@@ -115,6 +141,8 @@
 declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }