Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -618,11 +618,6 @@ return true; } -static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { - int64_t Val; - return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; -} - bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr &MI) const { if (selectImpl(MI, *CoverageInfo)) @@ -647,6 +642,20 @@ const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock *BB = MI.getParent(); + auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true); + if (ConstSrc1) { + auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true); + if (ConstSrc0) { + uint32_t Lo16 = static_cast(ConstSrc0->Value) & 0xffff; + uint32_t Hi16 = static_cast(ConstSrc1->Value) & 0xffff; + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) + .addImm(Lo16 | (Hi16 << 16)); + MI.eraseFromParent(); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + // TODO: This should probably be a combine somewhere // (build_vector_trunc $src0, undef -> copy $src0 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); @@ -688,7 +697,7 @@ } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && isZero(Src1, *MRI)) { + } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -91,9 +91,8 @@ ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffffffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -113,8 +112,8 @@ ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0xffffffc0, 4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -133,8 +132,8 @@ ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 4, 0xffffffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -152,13 +151,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffffffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 0xffffffc0, 4 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -210,12 +204,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 4, 0xffffffc0 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -522,8 +522,7 @@ ; GFX9-LABEL: v_ashr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, s4, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, ret <2 x i16> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir @@ -430,3 +430,273 @@ %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4 S_ENDPGM 0, implicit %5 ... + +--- +name: test_build_vector_trunc_s_v2s16_constant_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_CONSTANT i32 123 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: S_ENDPGM 0, implicit [[DEF]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc + ; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 -16 + %1:sgpr(s16) = G_CONSTANT i16 -3 + %2:sgpr(s32) = G_SEXT %0 + %3:sgpr(s32) = G_SEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 456 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_0 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0 + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_0_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -82,24 +82,21 @@ ; GFX906-LABEL: v_sdot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ ; GFX906-LABEL: v_sdot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ ; GFX906-LABEL: v_sdot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> , i32 8, i1 false) ret i32 %r Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -82,24 +82,21 @@ ; GFX906-LABEL: v_udot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ ; GFX906-LABEL: v_udot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ ; GFX906-LABEL: v_udot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ ; GFX906-LABEL: v_udot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> , i32 8, i1 false) ret i32 %r Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -533,8 +533,7 @@ ; GFX9-LABEL: v_lshr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, ret <2 x i16> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -529,8 +529,7 @@ ; GFX9-LABEL: v_shl_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, ret <2 x i16> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -50,16 +50,14 @@ ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX900-NEXT: s_xor_b32 s0, s0, s1 -; GFX900-NEXT: s_xor_b32 s0, s0, s2 +; GFX900-NEXT: s_xor_b32 s0, s0, -1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: scalar_xnor_v2i16_one_use: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX906-NEXT: s_xor_b32 s0, s0, s1 -; GFX906-NEXT: s_xor_b32 s0, s0, s2 +; GFX906-NEXT: s_xor_b32 s0, s0, -1 ; GFX906-NEXT: ; return to shader part epilog entry: %xor = xor <2 x i16> %a, %b @@ -150,7 +148,7 @@ ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX900-NEXT: s_mov_b32 s4, -1 ; GFX900-NEXT: s_mov_b32 s5, s4 ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] @@ -158,7 +156,7 @@ ; ; GFX906-LABEL: scalar_xnor_v4i16_one_use: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX906-NEXT: s_mov_b32 s4, -1 ; GFX906-NEXT: s_mov_b32 s5, s4 ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]