Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -58,6 +58,7 @@ SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -284,6 +284,8 @@ } setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + // LLVM will only turn this to fp64->fp32->fp16 if unsafe math + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -717,6 +719,7 @@ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, DAG); @@ -1947,6 +1950,21 @@ return SDValue(); } +SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + MVT SVT = Src.getSimpleValueType(); + SDLoc DL(Op); + + // F32 conversion is legal and should be selected + assert(SVT == MVT::f64); + // We don't necessarily know anything about the Src, so the flag is 0 + // TODO: Use known bits to determine flag + SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::FP_TO_FP16, DL, Op.getValueType(), FloatVal); +} + SDValue 
AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { EVT ExtraVT = cast(Op.getOperand(1))->getVT(); Index: test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll +++ test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -1,7 +1,8 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: +; FUNC-LABEL: {{^}}global_truncstore_f64_to_f16: +; GCN: v_cvt_f32_f64_e32 [[VAL:v[0-9]*]] +; GCN: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL]] ; GCN: s_endpgm define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { %val = load double, double addrspace(1)* %in @@ -10,7 +11,11 @@ ret void } -; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; FUNC-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; GCN: v_cvt_f32_f64_e32 [[VAL0:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL1:v[0-9]*]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL0]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL1]] ; GCN: s_endpgm define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %val = load <2 x double>, <2 x double> addrspace(1)* %in @@ -19,7 +24,13 @@ ret void } -; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; FUNC-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; GCN: v_cvt_f32_f64_e32 [[VAL0:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL1:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL2:v[0-9]*]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL0]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL1]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL2]] ; GCN: s_endpgm define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %val = load <3 x double>, <3 x 
double> addrspace(1)* %in @@ -28,7 +39,15 @@ ret void } -; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: +; FUNC-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: +; GCN: v_cvt_f32_f64_e32 [[VAL0:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL1:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL2:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL3:v[0-9]*]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL0]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL1]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL2]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL3]] ; GCN: s_endpgm define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %val = load <4 x double>, <4 x double> addrspace(1)* %in @@ -37,7 +56,23 @@ ret void } -; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; FUNC-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; GCN: v_cvt_f32_f64_e32 [[VAL0:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL1:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL2:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL3:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL4:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL5:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL6:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL7:v[0-9]*]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL0]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL1]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL2]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL3]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL4]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL5]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL6]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL7]] ; GCN: s_endpgm define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { %val = load <8 x double>, <8 x double> addrspace(1)* %in @@ -46,7 +81,39 @@ ret void } -; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; FUNC-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; 
GCN: v_cvt_f32_f64_e32 [[VAL0:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL1:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL2:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL3:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL4:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL5:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL6:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL7:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL8:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VAL9:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALa:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALb:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALc:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALd:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALe:v[0-9]*]] +; GCN-DAG: v_cvt_f32_f64_e32 [[VALf:v[0-9]*]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL0]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL1]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL2]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL3]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL4]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL5]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL6]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL7]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL8]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VAL9]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALa]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALb]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALc]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALd]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALe]] +; GCN-DAG: v_cvt_f16_f32_e32 {{v[0-9]*}}, [[VALf]] ; GCN: s_endpgm define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { %val = load <16 x double>, <16 x double> addrspace(1)* %in