diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2702,14 +2702,12 @@
   // TODO: Factor out code common with LowerFP_TO_UINT.
 
   EVT SrcVT = Src.getValueType();
-  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+  if (SrcVT == MVT::f16 ||
+      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);
 
-    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
-    SDValue FpToInt32 =
-        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
-    return FpToInt32;
+    SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
   }
 
   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -2725,14 +2723,12 @@
   // TODO: Factor out code common with LowerFP_TO_SINT.
 
   EVT SrcVT = Src.getValueType();
-  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+  if (SrcVT == MVT::f16 ||
+      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);
 
-    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
-    SDValue FpToInt32 =
-        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
-    return FpToInt32;
+    SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
   }
 
   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -37,8 +37,11 @@ ; test checks code generated for 'i64 = fp_to_sint f32'.
 ; GCN-LABEL: {{^}}fptosi_f16_to_i64
-; GCN: buffer_load_ushort
-; GCN: v_cvt_f32_f16_e32
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; GCN: v_cvt_i32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]]
+; GCN: v_ashrrev_i32_e32 v[[R_I64_High:[0-9]+]], 31, v[[R_I64_Low]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptosi_f16_to_i64(
     i64 addrspace(1)* %r,
@@ -104,10 +107,21 @@ ; test checks code generated for 'i64 = fp_to_sint f32'.
 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
-; GCN: buffer_load_dword
-; GCN: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; VI: v_cvt_f32_f16_sdwa
+; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; SI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
+; SI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; SI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; VI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]]
+; VI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]]
+; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -38,8 +38,11 @@ ; test checks code generated for 'i64 = fp_to_uint f32'.
 ; GCN-LABEL: {{^}}fptoui_f16_to_i64
-; GCN: buffer_load_ushort
-; GCN: v_cvt_f32_f16_e32
+; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[R_I64_High:[0-9]+]], 0
+; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; GCN: v_cvt_u32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptoui_f16_to_i64(
     i64 addrspace(1)* %r,
@@ -104,10 +107,19 @@ ; test checks code generated for 'i64 = fp_to_uint f32'.
 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
-; GCN: buffer_load_dword
-; GCN: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; VI: v_cvt_f32_f16_sdwa
+; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
+; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
+; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0
+; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
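
Note: the kernels exercised by the updated checks have roughly the shape below. This is a sketch reconstructed from the `define` lines visible in the hunks; the function body is assumed, since it falls outside the diff context.

define amdgpu_kernel void @fptosi_f16_to_i64(
    i64 addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  ; assumed body: load the f16 input, convert, store the i64 result
  %a.val = load half, half addrspace(1)* %a
  %r.val = fptosi half %a.val to i64
  store i64 %r.val, i64 addrspace(1)* %r
  ret void
}

With the lowering change above, FP_TO_SINT/FP_TO_UINT from f16 (or from an f32 that is itself an FP16_TO_FP) is now converted at 32-bit width (v_cvt_f32_f16 followed by v_cvt_i32_f32 or v_cvt_u32_f32) and then widened to i64: the signed case materializes the high dword with v_ashrrev_i32 by 31, and the unsigned case writes a zero high dword, which is what the new buffer_store_dwordx2/dwordx4 checks verify.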