Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -66,8 +66,7 @@
 
   SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1272,8 +1272,9 @@
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
-  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
-  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTLZ:
@@ -2722,44 +2723,43 @@
   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
 }
 
+/// Custom lowering shared by ISD::FP_TO_SINT and ISD::FP_TO_UINT.
+///
+/// Returns \p Op unchanged for f16 -> i16, the lowered value for the cases
+/// handled below, and an empty SDValue for every other type combination.
-SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
-                                              SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+                                             SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
-
-  // TODO: Factor out code common with LowerFP_TO_UINT.
-
+  unsigned OpOpcode = Op.getOpcode();
   EVT SrcVT = Src.getValueType();
-  if (SrcVT == MVT::f16 ||
-      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
-    SDLoc DL(Op);
+  EVT DestVT = Op.getValueType();
 
-    SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
-    return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
-  }
+  // Will be selected natively.
+  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
+    return Op;
 
-  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
-    return LowerFP64_TO_INT(Op, DAG, true);
-
-  return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
-                                              SelectionDAG &DAG) const {
-  SDValue Src = Op.getOperand(0);
+  // Promote i16 to i32.
+  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+    SDLoc DL(Op);
 
-  // TODO: Factor out code common with LowerFP_TO_SINT.
+    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
+  }
 
-  EVT SrcVT = Src.getValueType();
+  // f16 sources (including f32 values produced by FP16_TO_FP) convert via i32
+  // and are then widened to i64 with the extension matching the opcode.
   if (SrcVT == MVT::f16 ||
       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);
 
-    SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
+    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+    unsigned Ext =
+        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
   }
 
-  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
-    return LowerFP64_TO_INT(Op, DAG, false);
+  if (DestVT == MVT::i64 && SrcVT == MVT::f64)
+    return LowerFP64_TO_INT(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
 
   return SDValue();
 }
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -530,8 +530,8 @@
     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
 
-    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
-    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
 
     // F16 - Constant Actions.
     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -4522,6 +4522,9 @@
     return lowerFMINNUM_FMAXNUM(Op, DAG);
   case ISD::FMA:
     return splitTernaryVectorOp(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
Index: llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=GCN,FUNC,SI
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=GCN,FUNC,VI
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=GCN,FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=GCN,FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=EG -check-prefix=FUNC
 
 declare float @llvm.fabs.f32(float) #1
@@ -240,11 +240,7 @@
 }
 
 ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i16:
-; The reason different instructions are used on SI and VI is because for
-; SI fp_to_uint is legalized by the type legalizer and for VI it is
-; legalized by the dag legalizer and they legalize fp_to_uint differently.
-; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
-; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_short [[VAL]]
 define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
   %uint = fptoui float %in to i16
Index: llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -3,8 +3,9 @@
 
 ; GCN-LABEL: {{^}}fptosi_f16_to_i16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
+; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptosi_f16_to_i16(
@@ -65,11 +66,9 @@
 ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]
 
-; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_cvt_i16_f16_e32 v[[A_I16_0:[0-9]+]], v[[A_V2_F16]]
+; VI: v_cvt_i16_f16_sdwa v[[A_I16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[A_I16_0]], v[[A_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 
 ; GCN: buffer_store_dword v[[R_V2_I16]]
 ; GCN: s_endpgm
Index: llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -3,9 +3,9 @@
 
 ; GCN-LABEL: {{^}}fptoui_f16_to_i16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
-; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
+; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptoui_f16_to_i16(
@@ -65,11 +65,9 @@
 ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]]
 
-; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_cvt_u16_f16_e32 v[[A_U16_1:[0-9]+]], v[[A_V2_F16]]
+; VI: v_cvt_u16_f16_sdwa v[[R_U16_0:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[A_U16_1]], v[[R_U16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 
 ; GCN: buffer_store_dword v[[R_V2_I16]]
 ; GCN: s_endpgm