Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -530,8 +530,8 @@ setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Legal); // F16 - Constant Actions. setOperationAction(ISD::ConstantFP, MVT::f16, Legal); Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -914,6 +914,48 @@ (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) >; +def : GCNPat < + (i16 (fp_to_sint f16:$src)), + (V_CVT_I16_F16_e32 VSrc_b32:$src) +>; + +def : GCNPat < + (i16 (fp_to_uint f16:$src)), + (V_CVT_U16_F16_e32 VSrc_b32:$src) +>; + +def : GCNPat < + (i16 (fp_to_sint f32:$src)), + (V_CVT_I32_F32_e32 VSrc_b32:$src) +>; + +def : GCNPat < + (i16 (fp_to_uint f32:$src)), + (V_CVT_U32_F32_e32 VSrc_b32:$src) +>; + +def : GCNPat < + (i16 (fp_to_sint f64:$src)), + (V_CVT_I32_F64_e32 VReg_64:$src) +>; + +def : GCNPat < + (i16 (fp_to_uint f64:$src)), + (V_CVT_U32_F64_e32 VReg_64:$src) +>; + +let OtherPredicates = [HasSDWA] in { +def : GCNPat < + (i32 (sext (i16 (fp_to_sint f16:$src)))), + (V_LSHLREV_B32_e32 (i32 16), (V_CVT_I16_F16_e32 VSrc_b32:$src)) +>; + +def : GCNPat < + (i32 (sext (i16 (fp_to_uint f16:$src)))), + (V_LSHLREV_B32_e32 (i32 16), (V_CVT_U16_F16_e32 VSrc_b32:$src)) +>; +} // OtherPredicates = [HasSDWA] + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// Index: llvm/test/CodeGen/AMDGPU/fp_to_uint.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -240,11 +240,8 @@ } ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i16: -; The reason different instructions are used on SI and VI is because for -; SI fp_to_uint is legalized by the type legalizer and for VI it is -; legalized by the dag legalizer and they legalize fp_to_uint differently. ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} -; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} +; VI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { %uint = fptoui float %in to i16 Index: llvm/test/CodeGen/AMDGPU/fptosi.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,10 +1,16 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=VI %s ; GCN-LABEL: {{^}}fptosi_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] + ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] + +; VI: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; VI: buffer_store_short v[[R_I16]] + ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm define amdgpu_kernel void @fptosi_f16_to_i16( @@ -65,11 +71,11 @@ ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]] -; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] -; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: v_cvt_i16_f16_e32 v[[R_I16_LO:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_i16_f16_sdwa v[[A_I32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[A_I32_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: buffer_store_dword v[[R_V2_I16]] ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm @@ -88,7 +94,12 @@ ; GCN: buffer_load_dword ; GCN: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 + +; VI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_sdwa +; VI: v_cvt_i32_f32_e32 +; VI: v_cvt_i32_f32_e32 + ; GCN: v_cvt_i32_f32_e32 ; GCN: v_cvt_i32_f32_e32 ; GCN: buffer_store_dwordx2 @@ -108,6 +119,8 @@ ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]] +; VI: buffer_load_dword v[[A_F16_0:[0-9]+]] + ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -115,12 +128,15 @@ ; SI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] ; SI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; SI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] -; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] -; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] + +; VI: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; VI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] -; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] +; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; VI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] +; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] +; VI: buffer_store_dwordx4 v{{\[}}[[R_I64_1_Low]]{{\:}}[[R_I64_0_High]]{{\]}} + ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @fptosi_v2f16_to_v2i64( Index: llvm/test/CodeGen/AMDGPU/fptoui.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,11 +1,15 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=VI %s ; GCN-LABEL: {{^}}fptoui_f16_to_i16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; VI: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] -; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] + +; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] +; VI: buffer_store_short v[[R_I16]] + ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm define amdgpu_kernel void @fptoui_f16_to_i16( @@ -56,23 +60,23 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] +; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] +; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]] ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]] -; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] -; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]] -; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: v_cvt_u16_f16_e32 v[[A_F16_1:[0-9]+]], v[[A_V2_F16]] +; VI: v_cvt_u16_f16_sdwa v[[A_F16_0:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI: v_or_b32_sdwa v[[A_V2_U16:[0-9]+]], v[[A_F16_1]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN: buffer_store_dword v[[R_V2_I16]] -; GCN: s_endpgm +; VI: buffer_store_dword v[[A_V2_U16]] +; GCN: buffer_store_dword v[[R_V2_I16]] +; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, @@ -108,6 +112,7 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]] +; VI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]