Index: lib/Target/R600/AMDGPUISelLowering.h
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.h
+++ lib/Target/R600/AMDGPUISelLowering.h
@@ -196,6 +196,14 @@
   SAMPLEB,
   SAMPLED,
   SAMPLEL,
+
+  // Convert a float to half and store it in the low 16-bits of the 32-bit
+  // integer output.
+  // XXX - Would it be better to handle this as a subregister?
+  CVT_F16_F32,
+
+  // Convert the low 16-bits of the 32-bit integer input to a float.
+  CVT_F32_F16,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -84,6 +84,24 @@
 #include "AMDGPUGenCallingConv.inc"
 
+static EVT getFloatVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::f32, VT.getVectorNumElements()) :
+    MVT::f32;
+}
+
+static EVT getShortVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::i16, VT.getVectorNumElements()) :
+    MVT::i16;
+}
+
+static EVT getIntVT(LLVMContext &Context, EVT VT) {
+  return VT.isVector() ?
+    EVT::getVectorVT(Context, MVT::i32, VT.getVectorNumElements()) :
+    MVT::i32;
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   TargetLowering(TM, new TargetLoweringObjectFileELF()) {
@@ -154,6 +172,9 @@
   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Custom);
+  setTruncStoreAction(MVT::f32, MVT::f16, Custom);
+  setTruncStoreAction(MVT::f64, MVT::f16, Custom);
 
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@@ -927,6 +948,28 @@
   EVT VT = Op.getValueType();
   EVT MemVT = Load->getMemoryVT();
 
+  EVT MemEltVT = MemVT.getScalarType();
+  if (MemEltVT == MVT::f16) {
+    assert(ExtType != ISD::NON_EXTLOAD);
+
+    EVT ShortVT = getShortVT(*DAG.getContext(), VT);
+    EVT IntVT = getIntVT(*DAG.getContext(), VT);
+    EVT FloatVT = getFloatVT(*DAG.getContext(), VT);
+
+    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, IntVT,
+                                       Load->getChain(),
+                                       Load->getBasePtr(),
+                                       ShortVT,
+                                       Load->getMemOperand());
+
+    SDValue Conv = DAG.getNode(AMDGPUISD::CVT_F32_F16, DL, FloatVT, ExtLoad32);
+
+    if (VT.getScalarType() == MVT::f32)
+      return Conv;
+
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Conv);
+  }
+
   if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
     // We can do the extload to 32-bits, and then need to separately extend to
     // 64-bits.
@@ -986,7 +1029,6 @@
   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
 
-  EVT MemEltVT = MemVT.getScalarType();
   if (ExtType == ISD::SEXTLOAD) {
     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
@@ -1011,6 +1053,28 @@
   }
 
   EVT MemVT = Store->getMemoryVT();
+
+  EVT MemEltVT = MemVT.getScalarType();
+  if (MemEltVT == MVT::f16) {
+    assert(Store->isTruncatingStore());
+
+    SDValue Val = Store->getValue();
+    EVT VT = Val.getValueType();
+
+    if (VT.getScalarType() != MVT::f32) {
+      Val = DAG.getNode(ISD::FP_ROUND, DL, getFloatVT(*DAG.getContext(), VT),
+                        Val, DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue Conv = DAG.getNode(AMDGPUISD::CVT_F16_F32, DL, MVT::i32, Val);
+    EVT ShortVT = VT.isVector() ?
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()) :
+      MVT::i16;
+
+    return DAG.getTruncStore(Chain, DL, Conv, Store->getBasePtr(), ShortVT,
+                             Store->getMemOperand());
+  }
+
   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
       MemVT.bitsLT(MVT::i32)) {
     unsigned Mask = 0;
@@ -1407,6 +1471,8 @@
   NODE_NAME_CASE(SAMPLEB)
   NODE_NAME_CASE(SAMPLED)
   NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(CVT_F16_F32)
+  NODE_NAME_CASE(CVT_F32_F16)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
@@ -1460,6 +1526,11 @@
     computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), KnownZero,
                               KnownOne, DAG, Depth);
     break;
+
+  case AMDGPUISD::CVT_F16_F32:
+    KnownZero = APInt::getHighBitsSet(32, 16);
+    break;
+
   default:
     break;
   }
Index: lib/Target/R600/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/R600/AMDGPUInstrInfo.td
+++ lib/Target/R600/AMDGPUInstrInfo.td
@@ -100,3 +100,18 @@
 def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
   [SDNPCommutative]
 >;
+
+// half is treated as a 32-bit integer.
+def AMDGPUF16_F32Op : SDTypeProfile<1, 1,
+  [SDTCisInt<0>, SDTCisFP<1>]>;
+
+def AMDGPUF32_F16Op : SDTypeProfile<1, 1,
+  [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def AMDGPUcvt_f16_f32 : SDNode<"AMDGPUISD::CVT_F16_F32", AMDGPUF16_F32Op,
+  []
+>;
+
+def AMDGPUcvt_f32_f16 : SDNode<"AMDGPUISD::CVT_F32_F16", AMDGPUF32_F16Op,
+  []
+>;
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -962,8 +962,15 @@
   [(set i32:$dst, (fp_to_sint f32:$src0))]
 >;
 defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+  [(set i32:$dst, (AMDGPUcvt_f16_f32 f32:$src0))]
+>;
+
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+  [(set f32:$dst, (AMDGPUcvt_f32_f16 i32:$src0))]
+>;
+
 //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
 //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
 //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
Index: test/CodeGen/R600/half.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/half.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @extload_half_to_float
+; SI: BUFFER_LOAD_USHORT [[REG:v[0-9]+]],
+; SI: V_CVT_F32_F16_e32 v{{[0-9]+}}, [[REG]]
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @extload_half_to_float(float addrspace(1)* %out, half addrspace(1)* %in) nounwind {
+  %load = load half addrspace(1)* %in, align 2
+  %ext = fpext half %load to float
+  store float %ext, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @extload_half_to_double
+; SI: BUFFER_LOAD_USHORT [[LOADREG:v[0-9]+]],
+; SI: V_CVT_F32_F16_e32 [[CONVREG0:v[0-9]+]], [[LOADREG]]
+; SI: V_CVT_F64_F32_e32 v{{\[[0-9]+:[0-9]+\]}}, [[CONVREG0]]
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @extload_half_to_double(double addrspace(1)* %out, half addrspace(1)* %in) nounwind {
+  %load = load half addrspace(1)* %in, align 2
+  %ext = fpext half %load to double
+  store double %ext, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @truncstore_float_to_half
+; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]],
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI: S_ENDPGM
+define void @truncstore_float_to_half(half addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %load = load float addrspace(1)* %in, align 4
+  %ext = fptrunc float %load to half
+  store half %ext, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; SI-LABEL: @truncstore_double_to_half
+; SI: V_CVT_F32_F64_e32 [[CVTFLOAT:v[0-9]+]],
+; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[CVTFLOAT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI: S_ENDPGM
+define void @truncstore_double_to_half(half addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+  %load = load double addrspace(1)* %in, align 8
+  %ext = fptrunc double %load to half
+  store half %ext, half addrspace(1)* %out, align 2
+  ret void
+}
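
The custom load/store lowering above also handles vector memory types through the
getFloatVT/getShortVT/getIntVT helpers, but half.ll only covers the scalar cases.
Below is a minimal sketch of an additional test that could exercise the vector
path. It is not part of the patch: the function name is made up, and it assumes a
<2 x half> extload actually reaches the same custom lowering and survives the rest
of legalization on this target, so no CHECK lines are asserted.

; Hypothetical follow-up test (not in this patch): vector f16 extload path,
; assuming <2 x half> loads hit the same custom lowering as the scalar case.
define void @extload_v2half_to_v2float(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
  %load = load <2 x half> addrspace(1)* %in, align 4
  %ext = fpext <2 x half> %load to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out, align 8
  ret void
}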