diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -664,9 +664,6 @@ } } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; -defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; @@ -675,6 +672,12 @@ defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; +let SubtargetPredicate = isGFX8GFX9 in { + defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; + defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; + defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; +} + let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; @@ -857,7 +860,7 @@ >; } -let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX8GFX9] in { // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. @@ -867,9 +870,6 @@ (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) >; - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { - def : GCNPat< (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))), (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) @@ -885,7 +885,10 @@ defm : Arithmetic_i16_0Hi_Pats; defm : Arithmetic_i16_0Hi_Pats; defm : Arithmetic_i16_0Hi_Pats; -} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + +} // End Predicates = [Has16BitInsts, isGFX8GFX9] + +let Predicates = [Has16BitInsts] in { def : ZExt_i16_i1_Pat; def : ZExt_i16_i1_Pat; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -610,6 +610,20 @@ def : PermlaneDiscardVDstIn< BoundControlOrFetchInvalidPermlane, V_PERMLANEX16_B32_e64>; + + defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; + defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; + + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + + // Undo sub x, c -> add x, -c canonicalization since c is more likely + // an inline immediate than -c. + def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; + } // End SubtargetPredicate = isGFX10Plus class DivFmasPat : GCNPat< @@ -792,10 +806,11 @@ defm V_DIV_FIXUP_F16 : VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>; +defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>; + // FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these // (they do not support SDWA or DPP). -defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">; -defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">; defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">; defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -23,10 +23,11 @@ ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] ; GFX10-LABEL: name: add_s16 ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -54,11 +55,12 @@ ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] ; GFX10-LABEL: name: add_s16_zext_to_s32 ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_U16_e64_]], 0, 16, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -86,9 +88,10 @@ ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] ; GFX10-LABEL: name: add_s16_neg_inline_const_64 ; GFX10: liveins: $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 @@ -114,10 +117,11 @@ ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 ; GFX10: liveins: $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec - ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_U16_e64_]], 0, 16, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec + ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_CONSTANT i16 -64 diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -8785,6 +8785,21 @@ v_add_nc_u16 v5, v1, -4.0 clamp // GFX10: encoding: [0x05,0x80,0x03,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] +v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,0,0] +// GFX10: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] +// GFX10: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] +// GFX10: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] +// GFX10: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00] + v_sub_nc_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] @@ -8866,6 +8881,21 @@ v_sub_nc_u16 v5, v1, -4.0 clamp // GFX10: encoding: [0x05,0x80,0x04,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] +v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,0] +// GFX10: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] +// GFX10: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] +// GFX10: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] +// GFX10: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00] + v_mul_lo_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt @@ -21350,6 +21350,18 @@ # GFX10: v_add_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_add_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4b] 0x01,0x05,0xfe,0x4b @@ -95535,6 +95547,18 @@ # GFX10: v_sub_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00 + +# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_sub_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4d] 0x01,0x05,0xfe,0x4d