Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -608,9 +608,9 @@ defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; } // End FPDPRounding = 1 -defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; -defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; +defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>; +defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>; +defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>; let isCommutable = 1 in { let FPDPRounding = 1 in { @@ -620,16 +620,16 @@ defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; +defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>; +defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; -defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; -defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; +defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>; +defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>; +defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; +defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { @@ -722,53 +722,17 @@ // Note: 16-bit instructions produce a 0 result in the high 16-bits // on GFX8 and GFX9 and preserve high 16 bits on GFX10+ -def ClearHI16 : OutPatFrag<(ops node:$op), - (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; - -multiclass Arithmetic_i16_Pats { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) ->; +multiclass Arithmetic_i16_0Hi_Pats { def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) + (inst $src0, $src1) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), - sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; -} - -multiclass Bits_OpsRev_i16_Pats { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)), - (inst VSrc_b32:$src1, VSrc_b32:$src0)) ->; - -def : GCNPat< - (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)), - (inst VSrc_b32:$src1, VSrc_b32:$src0)) ->; - - -def : GCNPat< - (i64 (zext (op i16:$src0, i16:$src1))), - (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)), - (inst VSrc_b32:$src1, VSrc_b32:$src0)), - sub0, + (inst $src0, $src1), sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } @@ -800,35 +764,16 @@ let Predicates = [Has16BitInsts] in { let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -} - -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -} - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -} - -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; +defm : Arithmetic_i16_0Hi_Pats; } def : ZExt_i16_i1_Pat; Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -78,10 +78,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 @@ -147,10 +145,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -184,10 +180,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -329,10 +323,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s16) = G_TRUNC %0 Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -78,10 +78,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 @@ -147,10 +145,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -184,10 +180,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -329,10 +323,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s16) = G_TRUNC %0 Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -78,10 +78,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 @@ -147,10 +145,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -184,10 +180,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -329,10 +323,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s16) = G_TRUNC %0 Index: test/CodeGen/AMDGPU/idot2.ll =================================================================== --- test/CodeGen/AMDGPU/idot2.ll +++ test/CodeGen/AMDGPU/idot2.ll @@ -2775,7 +2775,6 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 @@ -2784,13 +2783,13 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX10-DL-NEXT: global_load_ushort v7, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v0 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v0, v7, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) Index: test/CodeGen/AMDGPU/idot4s.ll =================================================================== --- test/CodeGen/AMDGPU/idot4s.ll +++ test/CodeGen/AMDGPU/idot4s.ll @@ -841,7 +841,6 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -850,19 +849,19 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 ; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 ; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1057,16 +1056,16 @@ ; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000 ; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, sext(s2), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s2 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, sext(s3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s3 ; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, sext(s4), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s4 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, sext(s5), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s5 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 Index: test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- test/CodeGen/AMDGPU/idot4u.ll +++ test/CodeGen/AMDGPU/idot4u.ll @@ -1738,28 +1738,30 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_movk_i32 s3, 0xff +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s3 +; GFX10-DL-NEXT: s_and_b32 s1, s5, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v4, s0, s1, v4 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1938,9 +1940,9 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s2 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s3 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 @@ -2150,40 +2152,36 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s3, s4 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s3 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 +; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, Index: test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- test/CodeGen/AMDGPU/idot8s.ll +++ test/CodeGen/AMDGPU/idot8s.ll @@ -473,49 +473,47 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8 +; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -818,7 +816,6 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -826,40 +823,38 @@ ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 ; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 ; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 ; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2289,133 +2284,94 @@ ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s16 -; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 -; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s18 -; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s11 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s17 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 4 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12 +; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5 +; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 8 +; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s7 -; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v7 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 20 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 20 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s0 +; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 16 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s1 +; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 +; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s9 +; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 20 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v14 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s6 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s12 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v35, 12, s13 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v27, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v15 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s14 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v18, v35, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v19, v19, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v6, v13 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s1 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v10 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v35, 12, v18 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v19 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v15, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v31, 12, v10 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v16, 12, v16 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v11, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v35, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v31, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v10, v11 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v7, v6 -; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v9, v12 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v10, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v19, v10 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v15, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v11, v12 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, Index: test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- test/CodeGen/AMDGPU/idot8u.ll +++ test/CodeGen/AMDGPU/idot8u.ll @@ -2550,7 +2550,6 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2558,7 +2557,7 @@ ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 @@ -2566,47 +2565,47 @@ ; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 ; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 ; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s9 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 ; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014 ; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s6, s0 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s0 ; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018 -; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s1, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v6 -; GFX10-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s7, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s0, s4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX10-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v11, v5, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v6 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v14 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, Index: test/CodeGen/AMDGPU/preserve-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/preserve-hi16.ll +++ test/CodeGen/AMDGPU/preserve-hi16.ll @@ -3,8 +3,8 @@ ; GCN-LABEL: {{^}}shl_i16: ; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @shl_i16(i16 %x, i16 %y) { %res = shl i16 %x, %y ret i16 %res @@ -12,8 +12,8 @@ ; GCN-LABEL: {{^}}lshr_i16: ; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @lshr_i16(i16 %x, i16 %y) { %res = lshr i16 %x, %y ret i16 %res @@ -21,8 +21,8 @@ ; GCN-LABEL: {{^}}ashr_i16: ; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @ashr_i16(i16 %x, i16 %y) { %res = ashr i16 %x, %y ret i16 %res @@ -30,8 +30,8 @@ ; GCN-LABEL: {{^}}add_u16: ; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @add_u16(i16 %x, i16 %y) { %res = add i16 %x, %y ret i16 %res @@ -39,8 +39,8 @@ ; GCN-LABEL: {{^}}sub_u16: ; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @sub_u16(i16 %x, i16 %y) { %res = sub i16 %x, %y ret i16 %res @@ -48,8 +48,8 @@ ; GCN-LABEL: {{^}}mul_lo_u16: ; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @mul_lo_u16(i16 %x, i16 %y) { %res = mul i16 %x, %y ret i16 %res @@ -57,8 +57,8 @@ ; GCN-LABEL: {{^}}min_u16: ; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @min_u16(i16 %x, i16 %y) { %cmp = icmp ule i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y @@ -67,8 +67,8 @@ ; GCN-LABEL: {{^}}min_i16: ; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @min_i16(i16 %x, i16 %y) { %cmp = icmp sle i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y @@ -77,8 +77,8 @@ ; GCN-LABEL: {{^}}max_u16: ; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @max_u16(i16 %x, i16 %y) { %cmp = icmp uge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y @@ -87,10 +87,124 @@ ; GCN-LABEL: {{^}}max_i16: ; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]], -; GFX9-NEXT: s_setpc_b64 -; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_setpc_b64 define i16 @max_i16(i16 %x, i16 %y) { %cmp = icmp sge i16 %x, %y %res = select i1 %cmp, i16 %x, i16 %y ret i16 %res } + +; GCN-LABEL: {{^}}shl_i16_zext_i32: +; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @shl_i16_zext_i32(i16 %x, i16 %y) { + %res = shl i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}lshr_i16_zext_i32: +; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) { + %res = lshr i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}ashr_i16_zext_i32: +; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) { + %res = ashr i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}add_u16_zext_i32: +; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @add_u16_zext_i32(i16 %x, i16 %y) { + %res = add i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}sub_u16_zext_i32: +; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @sub_u16_zext_i32(i16 %x, i16 %y) { + %res = sub i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}mul_lo_u16_zext_i32: +; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) { + %res = mul i16 %x, %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}min_u16_zext_i32: +; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @min_u16_zext_i32(i16 %x, i16 %y) { + %cmp = icmp ule i16 %x, %y + %res = select i1 %cmp, i16 %x, i16 %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}min_i16_zext_i32: +; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @min_i16_zext_i32(i16 %x, i16 %y) { + %cmp = icmp sle i16 %x, %y + %res = select i1 %cmp, i16 %x, i16 %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}max_u16_zext_i32: +; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @max_u16_zext_i32(i16 %x, i16 %y) { + %cmp = icmp uge i16 %x, %y + %res = select i1 %cmp, i16 %x, i16 %y + %zext = zext i16 %res to i32 + ret i32 %zext +} + +; GCN-LABEL: {{^}}max_i16_zext_i32: +; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]], +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]] +; GCN-NEXT: s_setpc_b64 +define i32 @max_i16_zext_i32(i16 %x, i16 %y) { + %cmp = icmp sge i16 %x, %y + %res = select i1 %cmp, i16 %x, i16 %y + %zext = zext i16 %res to i32 + ret i32 %zext +} Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -283,11 +283,8 @@ ; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 - - +; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v +; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4 @@ -501,10 +498,10 @@ ; ; GFX89: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) { entry: