AMDGPU: add byte/short variants of the buffer load/store intrinsics

Adds four non-overloaded intrinsics whose data operand/result is a 32-bit
float:

  llvm.amdgcn.buffer.load.ubyte    llvm.amdgcn.buffer.load.ushort
  llvm.amdgcn.buffer.store.byte    llvm.amdgcn.buffer.store.short

For each intrinsic this patch adds:
  * the intrinsic definition (IntrinsicsAMDGPU.td),
  * an AMDGPUISD node and its debug name (AMDGPUISelLowering.h/.cpp),
  * an SDNode definition (SIInstrInfo.td),
  * MUBUF selection patterns (BUFInstructions.td),
  * case labels in the memory-intrinsic-info and custom-lowering switches,
    plus the intrinsic-ID -> AMDGPUISD opcode mapping (SIISelLowering.cpp),
  * a codegen test (test/CodeGen/AMDGPU/llvm.amdgcn.buffer.{load,store}.ll).

Review notes on this revision:
  * The two load intrinsics are now declared [IntrReadMem]. This mirrors
    [IntrWriteMem] on the two store intrinsics and the 'readonly' attribute
    the load test uses for their declarations.
  * Template-argument lists that had been stripped from the patch text
    (getInfo<...>, cast<...>, MUBUF_*IntrinsicPat<...>) are restored. The
    opcode strings on the added pattern lines are inferred from the
    instruction mnemonics the tests check for. The arguments on unchanged
    context lines in BUFInstructions.td are a best guess.
    TODO(review): confirm both against the tree before applying.
  * Indentation and the position of blank context lines were rebuilt from the
    hunk headers. If a hunk is rejected, retry with whitespace-insensitive
    matching (patch -l).

Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -490,6 +490,24 @@
      llvm_i1_ty],     // slc(imm)
     [], "", [SDNPMemOperand]>;
 
+def int_amdgcn_buffer_load_ubyte : Intrinsic <
+  [llvm_float_ty],
+  [llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // vindex(VGPR)
+   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
+   llvm_i1_ty,        // glc(imm)
+   llvm_i1_ty],       // slc(imm)
+  [IntrReadMem], "", [SDNPMemOperand]>;
+
+def int_amdgcn_buffer_load_ushort : Intrinsic <
+  [llvm_float_ty],
+  [llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // vindex(VGPR)
+   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
+   llvm_i1_ty,        // glc(imm)
+   llvm_i1_ty],       // slc(imm)
+  [IntrReadMem], "", [SDNPMemOperand]>;
+
 class AMDGPUBufferLoad : Intrinsic <
   [llvm_anyfloat_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
@@ -501,6 +519,26 @@
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
 
+def int_amdgcn_buffer_store_byte : Intrinsic <
+  [],
+  [llvm_float_ty,     // vdata(VGPR) -- this variant writes low 8 bits
+   llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // vindex(VGPR)
+   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
+   llvm_i1_ty,        // glc(imm)
+   llvm_i1_ty],       // slc(imm)
+  [IntrWriteMem], "", [SDNPMemOperand]>;
+
+def int_amdgcn_buffer_store_short : Intrinsic <
+  [],
+  [llvm_float_ty,     // vdata(VGPR) -- this variant writes low 16 bits
+   llvm_v4i32_ty,     // rsrc(SGPR)
+   llvm_i32_ty,       // vindex(VGPR)
+   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
+   llvm_i1_ty,        // glc(imm)
+   llvm_i1_ty],       // slc(imm)
+  [IntrWriteMem], "", [SDNPMemOperand]>;
+
 class AMDGPUBufferStore : Intrinsic <
   [],
   [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -465,9 +465,13 @@
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
+  BUFFER_LOAD_UBYTE,
+  BUFFER_LOAD_USHORT,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
   BUFFER_STORE,
+  BUFFER_STORE_BYTE,
+  BUFFER_STORE_SHORT,
   BUFFER_STORE_FORMAT,
   BUFFER_STORE_FORMAT_D16,
   BUFFER_ATOMIC_SWAP,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3990,9 +3990,13 @@
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_STORE)
+  NODE_NAME_CASE(BUFFER_STORE_BYTE)
+  NODE_NAME_CASE(BUFFER_STORE_SHORT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1016,6 +1016,8 @@
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, f32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, f32, "BUFFER_LOAD_USHORT">;
 
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
@@ -1060,6 +1062,9 @@
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, f32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, f32, "BUFFER_STORE_SHORT">;
+
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -735,6 +735,8 @@
   }
   case Intrinsic::amdgcn_tbuffer_load:
   case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_ubyte:
+  case Intrinsic::amdgcn_buffer_load_ushort:
   case Intrinsic::amdgcn_buffer_load_format: {
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -751,6 +753,8 @@
   }
   case Intrinsic::amdgcn_tbuffer_store:
   case Intrinsic::amdgcn_buffer_store:
+  case Intrinsic::amdgcn_buffer_store_byte:
+  case Intrinsic::amdgcn_buffer_store_short:
   case Intrinsic::amdgcn_buffer_store_format: {
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     Info.opc = ISD::INTRINSIC_VOID;
@@ -4903,6 +4907,8 @@
                                    M->getMemoryVT(), M->getMemOperand());
   }
   case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_ubyte:
+  case Intrinsic::amdgcn_buffer_load_ushort:
   case Intrinsic::amdgcn_buffer_load_format: {
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
@@ -4913,8 +4919,15 @@
       Op.getOperand(6)  // slc
     };
 
-    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
-        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+    unsigned Opc = 0;
+    switch (IntrID) {
+    case Intrinsic::amdgcn_buffer_load: Opc = AMDGPUISD::BUFFER_LOAD; break;
+    case Intrinsic::amdgcn_buffer_load_ubyte: Opc = AMDGPUISD::BUFFER_LOAD_UBYTE; break;
+    case Intrinsic::amdgcn_buffer_load_ushort: Opc = AMDGPUISD::BUFFER_LOAD_USHORT; break;
+    case Intrinsic::amdgcn_buffer_load_format: Opc = AMDGPUISD::BUFFER_LOAD_FORMAT; break;
+    default: llvm_unreachable("Unexpected IntrinsicID");
+    }
+
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
 
@@ -5274,6 +5287,8 @@
   }
 
   case Intrinsic::amdgcn_buffer_store:
+  case Intrinsic::amdgcn_buffer_store_byte:
+  case Intrinsic::amdgcn_buffer_store_short:
   case Intrinsic::amdgcn_buffer_store_format: {
     SDValue VData = Op.getOperand(2);
     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
@@ -5288,8 +5303,14 @@
       Op.getOperand(6), // glc
       Op.getOperand(7)  // slc
     };
-    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
-                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    unsigned Opc = 0;
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_buffer_store: Opc = AMDGPUISD::BUFFER_STORE; break;
+    case Intrinsic::amdgcn_buffer_store_byte: Opc = AMDGPUISD::BUFFER_STORE_BYTE; break;
+    case Intrinsic::amdgcn_buffer_store_short: Opc = AMDGPUISD::BUFFER_STORE_SHORT; break;
+    case Intrinsic::amdgcn_buffer_store_format: Opc = AMDGPUISD::BUFFER_STORE_FORMAT; break;
+    default: llvm_unreachable("Unexpected IntrinsicID");
+    }
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -113,6 +113,10 @@
 
 def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
@@ -129,6 +133,10 @@
 
 def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_byte : SDNode <"AMDGPUISD::BUFFER_STORE_BYTE", SDTBufferStore,
+                            [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT", SDTBufferStore,
+                            [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
                             [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -227,9 +227,31 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_load_ubyte:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+main_body:
+  %val = call float @llvm.amdgcn.buffer.load.ubyte(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ushort:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
+main_body:
+  %val = call float @llvm.amdgcn.buffer.load.ushort(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret float %val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+declare float @llvm.amdgcn.buffer.load.ubyte(<4 x i32>, i32, i32, i1, i1) #0
+declare float @llvm.amdgcn.buffer.load.ushort(<4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
 attributes #0 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -170,9 +170,31 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_byte:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.byte(float %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_short:
+;CHECK-NOT: s_waitcnt
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
+define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.short(float %v1, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+  ret void
+}
+
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.byte(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.short(float, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
 
 attributes #0 = { nounwind }