Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -53,6 +53,7 @@
   SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
 
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -279,6 +279,7 @@
   }
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
 
   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
   for (MVT VT : ScalarIntVTs) {
@@ -806,6 +807,7 @@
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
   case ISD::CTLZ:
@@ -1959,6 +1961,119 @@
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
 }
 
+/// Custom-lower FP_TO_FP16 for an f64 source by assembling the half-precision
+/// bit pattern with 32-bit integer operations, rounding to nearest even.
+/// Returns an empty SDValue when UnsafeFPMath is enabled, because a generic
+/// expansion already exists for that case.
+SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+
+  if (getTargetMachine().Options.UnsafeFPMath) {
+    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue N0 = Op.getOperand(0);
+  MVT SVT = N0.getSimpleValueType();
+  assert(SVT == MVT::f64);
+
+  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+  const unsigned ExpMask = 0x7ff;
+  const unsigned ExpBiasf64 = 1023;
+  const unsigned ExpBiasf16 = 15;
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue One = DAG.getConstant(1, DL, MVT::i32);
+  // Split the f64 bit pattern into its high (UH) and low (U) 32-bit words.
+  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
+  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
+                           DAG.getConstant(32, DL, MVT::i64));
+  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
+  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
+  // Bits 20..30 of the high word hold the 11-bit biased f64 exponent.
+  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                          DAG.getConstant(20, DL, MVT::i32));
+  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
+                  DAG.getConstant(ExpMask, DL, MVT::i32));
+  // Subtract the fp64 exponent bias (1023) to get the real exponent and
+  // add the f16 bias (15) to get the biased exponent for the f16 format.
+  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
+                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
+
+  // Place high-word bits 9..19 into bits 1..11 of M; bit 0 is left clear for
+  // the sticky bit computed below.
+  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                          DAG.getConstant(8, DL, MVT::i32));
+  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
+                  DAG.getConstant(0xffe, DL, MVT::i32));
+
+  // Sticky bit: set when any of the remaining low significand bits (the low
+  // 9 bits of the high word or any bit of the low word) is non-zero.
+  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
+                                  DAG.getConstant(0x1ff, DL, MVT::i32));
+  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
+
+  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
+  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
+
+  // Result for an all-ones f64 exponent (Inf/NaN):
+  // (M != 0 ? 0x0200 : 0) | 0x7c00;
+  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
+      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
+                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
+
+  // Normal-range encoding, still carrying two extra low bits for rounding:
+  // N = M | (E << 12);
+  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
+                  DAG.getConstant(12, DL, MVT::i32)));
+
+  // Denormal-range encoding: shift the significand (with the implicit bit
+  // 0x1000 set) right by B and fold any shifted-out bits into bit 0.
+  // B = clamp(1-E, 0, 13);
+  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
+                                  One, E);
+  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
+  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
+                  DAG.getConstant(13, DL, MVT::i32));
+
+  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+                                   DAG.getConstant(0x1000, DL, MVT::i32));
+
+  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
+  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
+  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
+  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
+
+  // Use the denormal form when E < 1, then round to nearest even using the low
+  // three bits of V (result LSB, round bit, sticky bit).
+  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
+  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
+                              DAG.getConstant(0x7, DL, MVT::i32));
+  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
+                  DAG.getConstant(2, DL, MVT::i32));
+  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
+                               One, Zero, ISD::SETEQ);
+  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
+                               One, Zero, ISD::SETGT);
+  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
+  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
+
+  // Exponents above 30 overflow to infinity; E == 1039 is the Inf/NaN input.
+  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
+                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
+  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
+                      I, V, ISD::SETEQ);
+
+  // Extract the sign bit.
+  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                            DAG.getConstant(16, DL, MVT::i32));
+  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
+                     DAG.getConstant(0x8000, DL, MVT::i32));
+
+  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
+  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
+}
+
 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
Index: llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll
@@ -1,17 +1,29 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,FUNC %s
 
 ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
-; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}fptrunc_f64_to_f16:
+; GCN-NOT: v_cvt
+; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
+; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]]
+define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+  %result = fptrunc double %in to half
+  %result_i16 = bitcast half %result to i16
+  store i16 %result_i16, i16 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
@@ -19,10 +31,10 @@
 }
 
 ; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -30,14 +42,14 @@
 }
 
 ; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, <8 x float> addrspace(1)* %out
Index: llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -1,5 +1,4 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI < %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
 ; GCN: s_endpgm