Index: lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2002,6 +2002,12 @@
              dbgs() << "\n");
   SDValue R = SDValue();

+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+    LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+    return;
+  }
+
   switch (N->getOpcode()) {
   // These opcodes cannot appear if promotion of FP16 is done in the backend
   // instead of Clang
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1182,8 +1182,12 @@
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
+defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
 defm : MUBUF_LoadIntrinsicPat;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@
                               SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                               bool IsIntrinsic = false) const;

+  SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+                             ArrayRef<SDValue> Ops) const;
+
  // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
  // dwordx4 if on SI.
  SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -674,9 +674,12 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
@@ -4114,6 +4117,41 @@
   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
 }

+SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
+                                             SelectionDAG &DAG,
+                                             ArrayRef<SDValue> Ops) const {
+  SDLoc DL(M);
+  EVT LoadVT = M->getValueType(0);
+  EVT EltType = LoadVT.getScalarType();
+  EVT IntVT = LoadVT.changeTypeToInteger();
+
+  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
+  unsigned Opc =
+      IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+
+  if (IsD16) {
+    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
+  }
+
+  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
+    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+  if (isTypeLegal(LoadVT)) {
+    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
+                               M->getMemOperand(), DAG);
+  }
+
+  EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
+  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
+  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
+                                        M->getMemOperand(), DAG);
+  return DAG.getMergeValues(
+      {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
+      DL);
+}
+
 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                   SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
@@ -4240,8 +4278,14 @@
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
-      Results.push_back(Res);
-      Results.push_back(Res.getValue(1));
+      if (Res.getOpcode() == ISD::MERGE_VALUES) {
+        // FIXME: Hacky
+        Results.push_back(Res.getOperand(0));
+        Results.push_back(Res.getOperand(1));
+      } else {
+        Results.push_back(Res);
+        Results.push_back(Res.getValue(1));
+      }
       return;
     }
@@ -6199,6 +6243,8 @@
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format: {
+    const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+
     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
@@ -6211,28 +6257,12 @@
       DAG.getTargetConstant(0, DL, MVT::i1), // idxen
     };

-    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
-        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    auto *M = cast<MemSDNode>(Op);
-    EVT LoadVT = Op.getValueType();
-
-    if (LoadVT.getScalarType() == MVT::f16)
-      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                 M, DAG, Ops);
-
-    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
-      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
-    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
-                               M->getMemOperand(), DAG);
+    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load_format: {
+    const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+
     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
@@ -6245,25 +6275,7 @@
       DAG.getTargetConstant(1, DL, MVT::i1), // idxen
     };

-    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
-        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    auto *M = cast<MemSDNode>(Op);
-    EVT LoadVT = Op.getValueType();
-
-    if (LoadVT.getScalarType() == MVT::f16)
-      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                 M, DAG, Ops);
-
-    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
-      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
-    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
-                               M->getMemOperand(), DAG);
+    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_tbuffer_load: {
     MemSDNode *M = cast<MemSDNode>(Op);
@@ -7121,9 +7133,10 @@
   SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                                Ops, IntVT,
                                                M->getMemOperand());
-  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
-                                        LoadVT.getScalarType(), BufferLoad);
-  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
 }

 // Handle 8 bit and 16 bit buffer stores
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -298,13 +298,13 @@
   ret float %val
 }

-;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-LABEL: {{^}}raw_buffer_load_i16:
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
 main_body:
   %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = zext i16 %tmp to i32
@@ -340,6 +340,66 @@
   ret float %val
 }

+;CHECK-LABEL: {{^}}raw_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
+main_body:
+  %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store half %val, half addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
+main_body:
+  %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
+main_body:
+  %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
+main_body:
+  %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
+main_body:
+  %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+  ret void
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
@@ -349,5 +409,10 @@
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
 declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0
+declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0

 attributes #0 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -200,6 +200,78 @@
   ret float %val
 }

+;CHECK-LABEL: {{^}}struct_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store half %val, half addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store i16 %val, i16 addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+  ret void
+}
+
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
@@ -208,6 +280,13 @@
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+
+declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #0
+
 declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32) #0

 attributes #0 = { nounwind readonly }
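
Note on how the new lowering picks instructions (not part of the patch): lowerIntrinsicLoad keeps three paths apart. A 16-bit-element result of a *format* load takes the D16 route via adjustLoadValueType; a sub-dword scalar result of a plain load keeps using buffer_load_ubyte/ushort through handleByteShortBufferLoads; every other result is selected by its total dword width, either directly when the type is legal or through the equivalent i32-based memory type plus a bitcast. That is why the tests above expect buffer_load_ushort for f16/i16, buffer_load_dword for v2f16/v2i16, and buffer_load_dwordx2 for v4f16/v4i16. The short standalone C++ sketch below reproduces only that width-to-opcode mapping for illustration; ResultShape and chooseBufferLoad are made-up names, not LLVM API.

// buffer_load_mapping.cpp - standalone sketch of the non-format mapping only.
#include <cassert>
#include <cstdio>
#include <string>

// Result type of the intrinsic call, described just by its shape. Whether the
// elements are float or integer does not matter here, matching the patch,
// where v2f16 and v2i16 (or v4f16 and v4i16) select the same instruction.
struct ResultShape {
  unsigned EltBits; // bits per element, e.g. 16
  unsigned NumElts; // 1 for scalars
};

// Sub-dword scalars keep the ubyte/ushort loads; everything else is loaded as
// a whole number of dwords (via an i32-based equivalent type when needed), so
// only the total width decides the opcode.
static std::string chooseBufferLoad(ResultShape T) {
  if (T.NumElts == 1 && T.EltBits < 32)
    return T.EltBits == 8 ? "buffer_load_ubyte" : "buffer_load_ushort";
  unsigned StoreBits = T.EltBits * T.NumElts;
  assert(StoreBits % 32 == 0 && "expected a whole number of dwords");
  switch (StoreBits / 32) {
  case 1: return "buffer_load_dword";
  case 2: return "buffer_load_dwordx2";
  case 4: return "buffer_load_dwordx4";
  default: return "<unsupported>";
  }
}

int main() {
  // Matches the CHECK lines in the new raw/struct buffer load tests.
  std::printf("f16   -> %s\n", chooseBufferLoad({16, 1}).c_str()); // ushort
  std::printf("v2f16 -> %s\n", chooseBufferLoad({16, 2}).c_str()); // dword
  std::printf("v4i16 -> %s\n", chooseBufferLoad({16, 4}).c_str()); // dwordx2
  std::printf("v4f32 -> %s\n", chooseBufferLoad({32, 4}).c_str()); // dwordx4
  return 0;
}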