diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -751,6 +751,8 @@
   /// Transform G_ADD(G_SUB(y, x), x) to y.
   bool matchAddSubSameReg(MachineInstr &MI, Register &Src);
 
+  bool matchBuildVectorIdentityFold(MachineInstr &MI, Register &MatchInfo);
+
   /// \returns true if it is possible to simplify a select instruction \p MI
   /// to a min/max instruction of some sort.
   bool matchSimplifySelectToMinMax(MachineInstr &MI, BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -354,6 +354,17 @@
 inline bind_ty m_Pred(CmpInst::Predicate &P) { return P; }
 inline operand_type_match m_Pred() { return operand_type_match(); }
 
+struct ImplicitDefMatch {
+  bool match(const MachineRegisterInfo &MRI, Register Reg) {
+    MachineInstr *TmpMI;
+    if (mi_match(Reg, MRI, m_MInstr(TmpMI)))
+      return TmpMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF;
+    return false;
+  }
+};
+
+inline ImplicitDefMatch m_GImplicitDef() { return ImplicitDefMatch(); }
+
 // Helper for matching G_FCONSTANT
 inline bind_ty m_GFCst(const ConstantFP *&C) { return C; }
 
@@ -423,6 +434,19 @@
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_ADD, true>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_BUILD_VECTOR, false>
+m_GBuildVector(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_BUILD_VECTOR, false>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_BUILD_VECTOR_TRUNC, false>
+m_GBuildVectorTrunc(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_BUILD_VECTOR_TRUNC, false>(L,
+                                                                             R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOp_match<LHS, RHS, TargetOpcode::G_PTR_ADD, false>
 m_GPtrAdd(const LHS &L, const RHS &R) {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -932,6 +932,12 @@
   (apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])>;
 
+def buildvector_identity_fold : GICombineRule<
+  (defs root:$build_vector, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_BUILD_VECTOR_TRUNC, G_BUILD_VECTOR):$build_vector,
+         [{ return Helper.matchBuildVectorIdentityFold(*${build_vector}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceSingleDefInstWithReg(*${build_vector}, ${matchinfo}); }])>;
+
 def select_to_minmax: GICombineRule<
   (defs root:$root, build_fn_matchinfo:$info),
   (match (wip_match_opcode G_SELECT):$root,
@@ -955,7 +961,7 @@
                                      binop_right_to_zero, p2i_to_i2p,
                                      i2p_to_p2i, anyext_trunc_fold,
                                      fneg_fneg_fold, right_identity_one,
-                                     add_sub_reg]>;
+                                     add_sub_reg, buildvector_identity_fold]>;
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5800,6 +5800,52 @@
   return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
 }
 
+bool CombinerHelper::matchBuildVectorIdentityFold(MachineInstr &MI,
+                                                  Register &MatchInfo) {
+  // This combine folds the following patterns:
+  //
+  //  G_BUILD_VECTOR_TRUNC(G_BITCAST(x), G_LSHR(G_BITCAST(x), k))
+  //  G_BUILD_VECTOR(G_TRUNC(G_BITCAST(x)), G_TRUNC(G_LSHR(G_BITCAST(x), k)))
+  //  into
+  //   x
+  //  if
+  //   k == sizeof(VecEltTy)/2
+  //   type(x) == type(dst)
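+  //
+  // e.g. with x:<2 x s16>, G_BITCAST(x):s32 holds the two elements in its
+  // low and high 16 bits, so rebuilding the vector from the bitcast and the
+  // bitcast shifted right by k=16 simply reassembles x.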
+  //
+  //  G_BUILD_VECTOR(G_TRUNC(G_BITCAST(x)), undef)
+  //  into
+  //   x
+  //  if
+  //   type(x) == type(dst)
+
+  LLT DstVecTy = MRI.getType(MI.getOperand(0).getReg());
+  LLT DstEltTy = DstVecTy.getElementType();
+
+  Register Lo, Hi;
+
+  if (mi_match(
+          MI, MRI,
+          m_GBuildVector(m_GTrunc(m_GBitcast(m_Reg(Lo))), m_GImplicitDef()))) {
+    MatchInfo = Lo;
+    return MRI.getType(MatchInfo) == DstVecTy;
+  }
+
+  Optional<ValueAndVReg> ShiftAmount;
+  const auto LoPattern = m_GBitcast(m_Reg(Lo));
+  const auto HiPattern = m_GLShr(m_GBitcast(m_Reg(Hi)), m_GCst(ShiftAmount));
+  if (mi_match(
+          MI, MRI,
+          m_any_of(m_GBuildVectorTrunc(LoPattern, HiPattern),
+                   m_GBuildVector(m_GTrunc(LoPattern), m_GTrunc(HiPattern))))) {
+    if (Lo == Hi && ShiftAmount->Value == DstEltTy.getSizeInBits()) {
+      MatchInfo = Lo;
+      return MRI.getType(MatchInfo) == DstVecTy;
+    }
+  }
+
+  return false;
+}
+
 unsigned CombinerHelper::getFPMinMaxOpcForSelect(
     CmpInst::Predicate Pred, LLT DstTy,
     SelectPatternNaNBehaviour VsNaNRetVal) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -45,6 +45,12 @@
          [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def trunc_buildvector_fold : GICombineRule<
+  (defs root:$op, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_TRUNC):$op,
+         [{ return PostLegalizerHelper.matchTruncBuildVectorFold(*${op}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceSingleDefInstWithReg(*${op}, ${matchinfo}); }])>;
+
 def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
 
 def clamp_i64_to_i16 : GICombineRule<
@@ -119,7 +125,7 @@
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
   uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-  rcp_sqrt_to_rsq]> {
+  rcp_sqrt_to_rsq, trunc_buildvector_fold]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -74,6 +74,7 @@
                           const CvtF32UByteMatchInfo &MatchInfo);
 
   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
+  bool matchTruncBuildVectorFold(MachineInstr &MI, Register &MatchInfo);
 };
 
 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -303,6 +304,19 @@
   return TLI->isCanonicalized(Reg, MF);
 }
 
+bool AMDGPUPostLegalizerCombinerHelper::matchTruncBuildVectorFold(
+    MachineInstr &MI, Register &MatchInfo) {
+  assert(MI.getOpcode() == AMDGPU::G_TRUNC);
+
+  // Replace (G_TRUNC (G_BITCAST (G_BUILD_VECTOR x, y))) with just x
+  // if type(x) == type(G_TRUNC).
+  if (!mi_match(MI.getOperand(1).getReg(), MRI,
+                m_GBitcast(m_GBuildVector(m_Reg(MatchInfo), m_Reg()))))
+    return false;
+
+  return MRI.getType(MatchInfo) == MRI.getType(MI.getOperand(0).getReg());
+}
+
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
   AMDGPUCombinerHelper &Helper;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -50,10 +50,6 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x, <5 x half> inreg %y, <5 x float> inreg %z) { ; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry -; GFX9-FAST-DENORM-NEXT: s_pack_lh_b32_b16 s3, s3, s3 -; GFX9-FAST-DENORM-NEXT: s_pack_lh_b32_b16 s4, s4, s4 -; GFX9-FAST-DENORM-NEXT: s_pack_lh_b32_b16 s0, s0, s0 -; GFX9-FAST-DENORM-NEXT: s_pack_lh_b32_b16 s1, s1, s1 ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -786,177 +786,69 @@ ; GFX9-LABEL: test_3xhalf_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX9-NEXT: v_pk_add_f16 v1, v5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-CONTRACT-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-CONTRACT-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX9-DENORM-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX9-DENORM-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; GFX9-DENORM-NEXT: v_lshl_or_b32 v3, v8, 16, v3 -; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 -; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-UNSAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-UNSAFE-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-CONTRACT-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-CONTRACT-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; 
GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-DENORM-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-UNSAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-UNSAFE-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -2,8 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-PACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS %s define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) { ; 
GFX8-UNPACKED-LABEL: load_1d_f16_x: @@ -546,45 +546,21 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v3f16_xyz: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: load_1d_v3f16_xyz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v3f16_xyz: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-buildvector-identities.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-buildvector-identities.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-buildvector-identities.mir @@ -0,0 +1,157 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: v2s16_trunc_same_bitcast_lshr16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_trunc_same_bitcast_lshr16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: $vgpr0 = COPY %src(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast:_(s32) = G_BITCAST %src + %lshr_amount:_(s32) = G_CONSTANT i32 16 + %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount + %root:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %bitcast, %lshr + $vgpr0 = COPY %root +... 
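+# The G_BUILD_VECTOR_TRUNC above recombines the exact low and high 16-bit
+# halves of %bitcast, so the whole sequence is an identity and folds to %src.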
+ +--- +name: v2s16_trunc_different_bitcast_lshr16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_trunc_different_bitcast_lshr16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: $vgpr0 = COPY %src(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast0:_(s32) = G_BITCAST %src + %bitcast1:_(s32) = G_BITCAST %src + %lshr_amount:_(s32) = G_CONSTANT i32 16 + %lshr:_(s32) = G_LSHR %bitcast1, %lshr_amount + %root:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %bitcast0, %lshr + $vgpr0 = COPY %root +... + +--- +name: v2s16_trunc_same_bitcast_lshr8_nocombine +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_trunc_same_bitcast_lshr8_nocombine + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: %bitcast:_(s32) = G_BITCAST %src(<2 x s16>) + ; CHECK-NEXT: %lshr_amount:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount(s32) + ; CHECK-NEXT: %root:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %bitcast(s32), %lshr(s32) + ; CHECK-NEXT: $vgpr0 = COPY %root(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast:_(s32) = G_BITCAST %src + %lshr_amount:_(s32) = G_CONSTANT i32 8 + %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount + %root:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %bitcast, %lshr + $vgpr0 = COPY %root +... + +--- +name: v2s16_same_bitcast_lshr16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_same_bitcast_lshr16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: $vgpr0 = COPY %src(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast:_(s32) = G_BITCAST %src + %lshr_amount:_(s32) = G_CONSTANT i32 16 + %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount + %trunclo:_(s16) = G_TRUNC %bitcast + %trunchi:_(s16) = G_TRUNC %lshr + %root:_(<2 x s16>) = G_BUILD_VECTOR %trunclo, %trunchi + $vgpr0 = COPY %root +... + +--- +name: v2s16_same_bitcast_lshr8_nocombine +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_same_bitcast_lshr8_nocombine + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: %bitcast:_(s32) = G_BITCAST %src(<2 x s16>) + ; CHECK-NEXT: %lshr_amount:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount(s32) + ; CHECK-NEXT: %trunclo:_(s16) = G_TRUNC %bitcast(s32) + ; CHECK-NEXT: %trunchi:_(s16) = G_TRUNC %lshr(s32) + ; CHECK-NEXT: %root:_(<2 x s16>) = G_BUILD_VECTOR %trunclo(s16), %trunchi(s16) + ; CHECK-NEXT: $vgpr0 = COPY %root(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast:_(s32) = G_BITCAST %src + %lshr_amount:_(s32) = G_CONSTANT i32 8 + %lshr:_(s32) = G_LSHR %bitcast, %lshr_amount + %trunclo:_(s16) = G_TRUNC %bitcast + %trunchi:_(s16) = G_TRUNC %lshr + %root:_(<2 x s16>) = G_BUILD_VECTOR %trunclo, %trunchi + $vgpr0 = COPY %root +... 
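+# A shift amount of 8 does not split %bitcast at the 16-bit element boundary,
+# so the pieces are not the two halves of the value and neither BUILD_VECTOR
+# form above may be folded back to %src.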
+ +--- +name: v2s16_undefhi +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: v2s16_undefhi + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: $vgpr0 = COPY %src(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %bitcast:_(s32) = G_BITCAST %src + %trunc:_(s16) = G_TRUNC %bitcast + %undef:_(s16) = G_IMPLICIT_DEF + %root:_(<2 x s16>) = G_BUILD_VECTOR %trunc, %undef + $vgpr0 = COPY %root +... + +--- +name: v2s32_undefhi +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: v2s32_undefhi + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %src(<2 x s32>) + %src:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %bitcast:_(s64) = G_BITCAST %src + %trunc:_(s32) = G_TRUNC %bitcast + %undef:_(s32) = G_IMPLICIT_DEF + %root:_(<2 x s32>) = G_BUILD_VECTOR %trunc, %undef + $vgpr0_vgpr1 = COPY %root +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir @@ -0,0 +1,106 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: s16_trunc_v2s16_buildvector +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: s16_trunc_v2s16_buildvector + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(<2 x s16>) = G_BUILD_VECTOR %2, %3 + %5:_(s32) = G_BITCAST %4 + %6:_(s16) = G_TRUNC %5 + %7:_(s16) = G_CONSTANT i16 42 + %8:_(s16) = G_OR %7, %6 + %9:_(s32) = G_ZEXT %8 + $vgpr0 = COPY %9 +... + +--- +name: s16_trunc_v2s32_buildvector_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: s16_trunc_v2s32_buildvector_nofold + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(s64) = G_BITCAST %2 + %4:_(s16) = G_TRUNC %3 + %5:_(s16) = G_CONSTANT i16 42 + %6:_(s16) = G_OR %5, %4 + %7:_(s32) = G_ZEXT %6 + $vgpr0 = COPY %7 +... 
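+# No fold in the test above: the G_TRUNC result type (s16) differs from the
+# type of the first build_vector element (s32), so replacing the trunc with
+# that element would be incorrect.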
+ +--- +name: s32_trunc_v2s32_buildvector +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: s32_trunc_v2s32_buildvector + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(s64) = G_BITCAST %2 + %4:_(s32) = G_TRUNC %3 + $vgpr0 = COPY %4 +... + +--- +name: s32_trunc_v2s32_buildvector_multiple_users +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: s32_trunc_v2s32_buildvector_multiple_users + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[EVEC]](s32) + ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BITCAST]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0, %1 + %3:_(s64) = G_BITCAST %2 + %4:_(s32) = G_TRUNC %3 + %5:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1 + $vgpr0 = COPY %4 + $vgpr1 = COPY %5 + $vgpr2_vgpr3 = COPY %3 +...
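+# The G_TRUNC above still folds to %0 even though the build_vector and bitcast
+# have other users; only the trunc itself is rewritten.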