Expand ISD::FP_TO_FP16 with an f64 source.

* lib/CodeGen/SelectionDAG/LegalizeDAG.cpp: when the source type is f64 and
  the fast-math shortcut (f64 -> f32 -> f16) was not taken, expand the
  conversion with integer operations using round-to-nearest-even.
* lib/Target/AMDGPU/SIISelLowering.cpp: mark FP_TO_FP16 on f64 as Expand.
* test/CodeGen/AMDGPU/fptrunc.ll: add an f64 -> f16 test; the fast-math
  sequence is checked under the GCN-UNSAFE prefix.
* test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll: remove the XFAIL.

Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3225,10 +3225,10 @@
           DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
     }
     break;
-  case ISD::FP_TO_FP16:
+  case ISD::FP_TO_FP16: {
+    SDValue Op = Node->getOperand(0);
+    MVT SVT = Op.getSimpleValueType();
     if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
-      SDValue Op = Node->getOperand(0);
-      MVT SVT = Op.getSimpleValueType();
       if ((SVT == MVT::f64 || SVT == MVT::f80) &&
           TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
         // Under fastmath, we can expand this node into a fround followed by
@@ -3239,7 +3239,96 @@
             DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
+        // The node is fully expanded; do not also emit the exact expansion.
+        break;
       }
     }
+
+    if (SVT == MVT::f64) {
+      // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+      const unsigned ExpMask = 0x7ff;
+      const unsigned ExpBiasf64 = 1023;
+      const unsigned ExpBiasf16 = 15;
+      SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+      SDValue One = DAG.getConstant(1, dl, MVT::i32);
+      SDValue U = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Op);
+      SDValue UH = DAG.getNode(ISD::SRL, dl, MVT::i64, U,
+                               DAG.getConstant(32, dl, MVT::i64));
+      UH = DAG.getZExtOrTrunc(UH, dl, MVT::i32);
+      U = DAG.getZExtOrTrunc(U, dl, MVT::i32);
+      SDValue E = DAG.getNode(ISD::SRL, dl, MVT::i32, UH,
+                              DAG.getConstant(20, dl, MVT::i32));
+      E = DAG.getNode(ISD::AND, dl, MVT::i32, E,
+                      DAG.getConstant(ExpMask, dl, MVT::i32));
+      // Subtract the fp64 exponent bias (1023) to get the real exponent and
+      // add the f16 bias (15) to get the biased exponent for the f16 format.
+      E = DAG.getNode(ISD::ADD, dl, MVT::i32, E,
+                      DAG.getConstant(-ExpBiasf64 + ExpBiasf16, dl, MVT::i32));
+
+      SDValue M = DAG.getNode(ISD::SRL, dl, MVT::i32, UH,
+                              DAG.getConstant(8, dl, MVT::i32));
+      M = DAG.getNode(ISD::AND, dl, MVT::i32, M,
+                      DAG.getConstant(0xffe, dl, MVT::i32));
+
+      SDValue MaskedSig = DAG.getNode(ISD::AND, dl, MVT::i32, UH,
+                                      DAG.getConstant(0x1ff, dl, MVT::i32));
+      MaskedSig = DAG.getNode(ISD::OR, dl, MVT::i32, MaskedSig, U);
+
+      SDValue Lo40Set = DAG.getSelectCC(dl, MaskedSig, Zero,
+                                        Zero, One, ISD::SETEQ);
+      M = DAG.getNode(ISD::OR, dl, MVT::i32, M, Lo40Set);
+
+      // (M != 0 ? 0x0200 : 0) | 0x7c00;
+      SDValue I = DAG.getNode(ISD::OR, dl, MVT::i32,
+          DAG.getSelectCC(dl, M, Zero, DAG.getConstant(0x0200, dl, MVT::i32),
+                          Zero, ISD::SETNE), DAG.getConstant(0x7c00, dl, MVT::i32));
+
+      // N = M | (E << 12);
+      SDValue N = DAG.getNode(ISD::OR, dl, MVT::i32, M,
+          DAG.getNode(ISD::SHL, dl, MVT::i32, E,
+                      DAG.getConstant(12, dl, MVT::i32)));
+
+      // B = clamp(1-E, 0, 13);
+      SDValue OneSubExp = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                      One, E);
+      SDValue B = DAG.getNode(ISD::SMAX, dl, MVT::i32, OneSubExp, Zero);
+      B = DAG.getNode(ISD::SMIN, dl, MVT::i32, B,
+                      DAG.getConstant(13, dl, MVT::i32));
+
+      SDValue SigSetHigh = DAG.getNode(ISD::OR, dl, MVT::i32, M,
+                                       DAG.getConstant(0x1000, dl, MVT::i32));
+
+      SDValue D = DAG.getNode(ISD::SRL, dl, MVT::i32, SigSetHigh, B);
+      SDValue D0 = DAG.getNode(ISD::SHL, dl, MVT::i32, D, B);
+      SDValue D1 = DAG.getSelectCC(dl, D0, SigSetHigh, One, Zero, ISD::SETNE);
+      D = DAG.getNode(ISD::OR, dl, MVT::i32, D, D1);
+
+      SDValue V = DAG.getSelectCC(dl, E, One, D, N, ISD::SETLT);
+      SDValue VLow3 = DAG.getNode(ISD::AND, dl, MVT::i32, V,
+                                  DAG.getConstant(0x7, dl, MVT::i32));
+      V = DAG.getNode(ISD::SRL, dl, MVT::i32, V,
+                      DAG.getConstant(2, dl, MVT::i32));
+      SDValue V0 = DAG.getSelectCC(dl, VLow3, DAG.getConstant(3, dl, MVT::i32),
+                                   One, Zero, ISD::SETEQ);
+      SDValue V1 = DAG.getSelectCC(dl, VLow3, DAG.getConstant(5, dl, MVT::i32),
+                                   One, Zero, ISD::SETGT);
+      V1 = DAG.getNode(ISD::OR, dl, MVT::i32, V0, V1);
+      V = DAG.getNode(ISD::ADD, dl, MVT::i32, V, V1);
+
+      V = DAG.getSelectCC(dl, E, DAG.getConstant(30, dl, MVT::i32),
+                          DAG.getConstant(0x7c00, dl, MVT::i32), V, ISD::SETGT);
+      V = DAG.getSelectCC(dl, E, DAG.getConstant(1039, dl, MVT::i32),
+                          I, V, ISD::SETEQ);
+
+      // Extract the sign bit.
+      SDValue Sign = DAG.getNode(ISD::SRL, dl, MVT::i32, UH,
+                                 DAG.getConstant(16, dl, MVT::i32));
+      Sign = DAG.getNode(ISD::AND, dl, MVT::i32, Sign,
+                         DAG.getConstant(0x8000, dl, MVT::i32));
+
+      V = DAG.getNode(ISD::OR, dl, MVT::i32, Sign, V);
+      Results.push_back(DAG.getZExtOrTrunc(V, dl, Node->getValueType(0)));
+    }
     break;
+  }
   case ISD::ConstantFP: {
     ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
     // Check to see if this FP immediate is already legal.
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -114,6 +114,7 @@
 
   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
Index: test/CodeGen/AMDGPU/fptrunc.ll
===================================================================
--- test/CodeGen/AMDGPU/fptrunc.ll
+++ test/CodeGen/AMDGPU/fptrunc.ll
@@ -1,17 +1,29 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,FUNC %s
 
 ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
-; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}fptrunc_f64_to_f16:
+; GCN-NOT: v_cvt
+; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
+; GCN-UNSAFE: v_cvt_f16_f32_e32 {{v[0-9]+}}, [[F32]]
+define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+  %result = fptrunc double %in to half
+  %result_i16 = bitcast half %result to i16
+  store i16 %result_i16, i16 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
@@ -19,10 +31,10 @@
 }
 
 ; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -30,14 +42,14 @@
 }
 
 ; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
-; SI: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
+; GCN: v_cvt_f32_f64_e32
 define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, <8 x float> addrspace(1)* %out
Index: test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: llc -march=amdgcn -mcpu=SI < %s
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: