Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -819,7 +819,7 @@ defset list<Intrinsic> AMDGPUBufferIntrinsics = { class AMDGPUBufferLoad : Intrinsic < - [llvm_anyfloat_ty], + [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) @@ -840,7 +840,7 @@ class AMDGPUBufferStore : Intrinsic < [], - [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32 + [llvm_any_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -483,10 +483,16 @@ ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, BUFFER_LOAD, + BUFFER_LOAD_UBYTE, + BUFFER_LOAD_USHORT, + BUFFER_LOAD_BYTE, + BUFFER_LOAD_SHORT, BUFFER_LOAD_FORMAT, BUFFER_LOAD_FORMAT_D16, SBUFFER_LOAD, BUFFER_STORE, + BUFFER_STORE_BYTE, + BUFFER_STORE_SHORT, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4201,10 +4201,16 @@ NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) + NODE_NAME_CASE(BUFFER_LOAD_UBYTE) + NODE_NAME_CASE(BUFFER_LOAD_USHORT) + NODE_NAME_CASE(BUFFER_LOAD_BYTE) + NODE_NAME_CASE(BUFFER_LOAD_SHORT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) + NODE_NAME_CASE(BUFFER_STORE_BYTE) + NODE_NAME_CASE(BUFFER_STORE_SHORT) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) @@ -4369,6 +4375,14 @@ } break; } + case AMDGPUISD::BUFFER_LOAD_UBYTE: { + Known.Zero.setHighBits(24); + break; + } + case AMDGPUISD::BUFFER_LOAD_USHORT: { + Known.Zero.setHighBits(16); + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IID) { @@ -4414,6 +4428,14 @@ case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: return 31; + case AMDGPUISD::BUFFER_LOAD_BYTE: + return 25; + case AMDGPUISD::BUFFER_LOAD_SHORT: + return 17; + case AMDGPUISD::BUFFER_LOAD_UBYTE: + return 24; + case AMDGPUISD::BUFFER_LOAD_USHORT: + return 16; case AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP16_ZEXT: return 16; Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -1132,6 +1132,10 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { @@ -1196,6 +1200,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; //===----------------------------------------------------------------------===// // buffer_atomic patterns Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -216,11 +216,15 @@ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -5624,6 +5628,37 @@ if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + + // handle buffer_load_ubyte/byte/ushort/short overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) { + + // set Opc based on data type + if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_UBYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_USHORT; + + // set node opcode if buffer_load_byte/short + if (Op.hasOneUse()) { + if (M->use_begin()->getOpcode() == ISD::SIGN_EXTEND) { + if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_BYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_SHORT; + } + } + SDVTList ResList = DAG.getVTList(MVT::i32, Ops[0].getValueType()); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + SDValue BufferLoadMerge = DAG.getMergeValues({BufferLoadTrunc, + BufferLoad.getValue(1)}, DL); + return BufferLoadMerge; + } + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } @@ -5652,6 +5687,37 @@ if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + + // handle buffer_load_ubyte/byte/ushort/short overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) { + + // set Opc based on data type 
+ if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_UBYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_USHORT; + + // set node opcode if buffer_load_byte/short + if (Op.hasOneUse()) { + if (M->use_begin()->getOpcode() == ISD::SIGN_EXTEND) { + if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_BYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_SHORT; + } + } + SDVTList ResList = DAG.getVTList(MVT::i32, Ops[0].getValueType()); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + SDValue BufferLoadMerge = DAG.getMergeValues({BufferLoadTrunc, + BufferLoad.getValue(1)}, DL); + return BufferLoadMerge; + } + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } @@ -5680,6 +5746,37 @@ if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + + // handle buffer_load_ubyte/byte/ushort/short overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) { + + // set Opc based on data type + if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_UBYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_USHORT; + + // set node opcode if buffer_load_byte/short + if (Op.hasOneUse()) { + if (M->use_begin()->getOpcode() == ISD::SIGN_EXTEND) { + if (LoadVT.getScalarType() == MVT::i8) + Opc = AMDGPUISD::BUFFER_LOAD_BYTE; + else + Opc = AMDGPUISD::BUFFER_LOAD_SHORT; + } + } + SDVTList ResList = DAG.getVTList(MVT::i32, Ops[0].getValueType()); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + SDValue BufferLoadMerge = DAG.getMergeValues({BufferLoadTrunc, + BufferLoad.getValue(1)}, DL); + return BufferLoadMerge; + } + return 
DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } @@ -6250,6 +6347,23 @@ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // handle buffer_store_byte/short overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, + MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + if (VDataType == MVT::i8) + Opc = AMDGPUISD::BUFFER_STORE_BYTE; + else + Opc = AMDGPUISD::BUFFER_STORE_SHORT; + SDValue BufferStore = DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), + Ops, VDataType, + M->getMemOperand()); + return BufferStore; + } + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6276,6 +6390,23 @@ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // handle buffer_store_byte/short overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, + MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + if (VDataType == MVT::i8) + Opc = AMDGPUISD::BUFFER_STORE_BYTE; + else + Opc = AMDGPUISD::BUFFER_STORE_SHORT; + SDValue BufferStore = DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), + Ops, VDataType, + M->getMemOperand()); + return BufferStore; + } + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6302,6 +6433,23 @@ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // handle buffer_store_byte/short overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, + MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + if (VDataType == MVT::i8) + Opc = AMDGPUISD::BUFFER_STORE_BYTE; + else + Opc = AMDGPUISD::BUFFER_STORE_SHORT; + SDValue BufferStore = DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), + Ops, VDataType, + M->getMemOperand()); + return BufferStore; + } + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -120,6 +120,14 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", @@ -138,6 +146,12 @@ def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE", + SDTBufferStore, + [SDNPMayStore, 
SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -257,9 +257,178 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_ubyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %val = uitofp i8 %tmp to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i8 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %val = bitcast i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i8 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = zext i16 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i8 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %tmp2 = sext i16 %tmp to i32 + %tmp3 = mul i32 %tmp2, 255 + %val = bitcast i32 %tmp3 to float + ret float %val +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 +declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0 +declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -233,9 +233,35 @@ ret void } +;CHECK-LABEL: {{^}}buffer_store_byte: +;CHECK-NOT: s_waitcnt +;CHECK-NEXT: %bb. +;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8 +define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i8 + call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_short: +;CHECK-NOT: s_waitcnt +;CHECK-NEXT: %bb. 
+;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16 +define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i16 + call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -263,6 +263,62 @@ ret {<4 x float>, <2 x float>, float} %r2 } +;CHECK-LABEL: {{^}}raw_buffer_load_ubyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 +;CHECK: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp2 = zext i8 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}raw_buffer_load_ushort: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp2 = zext i16 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}raw_buffer_load_sbyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp2 = sext i8 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}raw_buffer_load_sshort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) { +main_body: + %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp2 = sext i16 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 @@ -270,5 +326,7 @@ declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x 
i32>, i32, i32, i32) #0 +declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0 attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -189,6 +189,32 @@ ret void } +;CHECK-LABEL: {{^}}raw_buffer_store_byte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} +;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 +;CHECK-NEXT: s_endpgm +define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i8 + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_store_short: +;CHECK-NEXT: %bb. +;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} +;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 +;CHECK-NEXT: s_endpgm +define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i16 + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 @@ -196,6 +222,8 @@ declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 +declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0 +declare void 
@llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -144,6 +144,62 @@ ret {<4 x float>, <2 x float>, float} %r2 } +;CHECK-LABEL: {{^}}struct_buffer_load_ubyte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp2 = zext i8 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}struct_buffer_load_ushort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp2 = zext i16 %tmp to i32 + %val = uitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}struct_buffer_load_sbyte: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp2 = sext i8 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + +;CHECK-LABEL: {{^}}struct_buffer_load_sshort: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen +;CHECK-NEXT: s_waitcnt vmcnt(0) +;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +;CHECK-NEXT: ; return to shader part epilog +define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + %tmp2 = sext i16 %tmp to i32 + %val = sitofp i32 %tmp2 to float + ret float %val +} + declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 @@ -151,5 +207,7 @@ declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0 +declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0 attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ 
test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -108,6 +108,32 @@ ret void } +;CHECK-LABEL: {{^}}struct_buffer_store_byte: +;CHECK-NEXT: %bb. +;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} +;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen +;CHECK-NEXT: s_endpgm +define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i8 + call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}struct_buffer_store_short: +;CHECK-NEXT: %bb. +;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} +;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen +;CHECK-NEXT: s_endpgm +define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +main_body: + %v2 = fptoui float %v1 to i32 + %v3 = trunc i32 %v2 to i16 + call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 @@ -115,6 +141,8 @@ declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly }