diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -415,6 +415,14 @@
   bool matchNotCmp(MachineInstr &MI, SmallVectorImpl<Register> &RegsToNegate);
   bool applyNotCmp(MachineInstr &MI, SmallVectorImpl<Register> &RegsToNegate);
 
+  bool matchConsecutiveInsertVecElts(MachineInstr &MI,
+                                     SmallVector<Register, 4> &MatchInfo);
+
+  bool applyBuildVecFromRegs(MachineInstr &MI,
+                             SmallVector<Register, 4> &MatchInfo);
+
+  bool matchInsertVecEltBuildVec(MachineInstr &MI,
+                                 SmallVector<Register, 4> &MatchInfo);
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -396,6 +396,35 @@
 inline CheckType m_SpecificType(LLT Ty) { return Ty; }
 
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty, unsigned Opcode>
+struct TrinaryOp_match {
+  Src0Ty Src0;
+  Src1Ty Src1;
+  Src2Ty Src2;
+
+  TrinaryOp_match(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2)
+      : Src0(Src0), Src1(Src1), Src2(Src2) {}
+  template <typename OpTy>
+  bool match(const MachineRegisterInfo &MRI, OpTy &&Op) {
+    MachineInstr *TmpMI;
+    if (mi_match(Op, MRI, m_MInstr(TmpMI))) {
+      if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 4) {
+        return (Src0.match(MRI, TmpMI->getOperand(1).getReg()) &&
+                Src1.match(MRI, TmpMI->getOperand(2).getReg()) &&
+                Src2.match(MRI, TmpMI->getOperand(3).getReg()));
+      }
+    }
+    return false;
+  }
+};
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
+inline TrinaryOp_match<Src0Ty, Src1Ty, Src2Ty,
+                       TargetOpcode::G_INSERT_VECTOR_ELT>
+m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
+  return TrinaryOp_match<Src0Ty, Src1Ty, Src2Ty,
+                         TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
+}
+
 } // namespace MIPatternMatch
 } // namespace llvm
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -480,6 +480,22 @@
   (apply [{ return Helper.applyCombineMulByNegativeOne(*${root}); }])
 >;
 
+def regs_small_vec : GIDefMatchData<"SmallVector<Register, 4>">;
+def combine_consecutive_insert_vec_elts : GICombineRule<
+  (defs root:$root, regs_small_vec:$info),
+  (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
+         [{ return Helper.matchConsecutiveInsertVecElts(*${root}, ${info}); }]),
+  (apply [{ return Helper.applyBuildVecFromRegs(*${root}, ${info}); }])>;
+
+def combine_ins_vec_build_vec : GICombineRule<
+  (defs root:$root, regs_small_vec:$info),
+  (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
+         [{ return Helper.matchInsertVecEltBuildVec(*${root}, ${info}); }]),
+  (apply [{ return Helper.applyBuildVecFromRegs(*${root}, ${info}); }])>;
+
+def insert_vec_elt_combines : GICombineGroup<[combine_consecutive_insert_vec_elts,
+                                              combine_ins_vec_build_vec]>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -505,7 +521,7 @@
 def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
                                        mul_by_neg_one]>;
 
-def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
+def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
     hoist_logic_op_with_same_opcode_hands,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2403,6 +2403,74 @@
   return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
 }
 
+bool CombinerHelper::matchConsecutiveInsertVecElts(
+    MachineInstr &MI, SmallVector<Register, 4> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT &&
+         "Invalid opcode");
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  assert(DstTy.isVector() && "Invalid G_INSERT_VECTOR_ELT?");
+  unsigned NumElts = DstTy.getNumElements();
+  auto Index = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+  if (!Index || *Index != (NumElts - 1))
+    return false;
+  SmallVector<std::pair<Register, int64_t>, 4> Inserts;
+  // Try to collect various InsertElts.
+  MachineInstr *CurrInst = &MI;
+  MachineInstr *TmpInst;
+  int64_t IntImm;
+  Register TmpReg;
+  while (mi_match(
+      CurrInst->getOperand(0).getReg(), MRI,
+      m_GInsertVecElt(m_MInstr(TmpInst), m_Reg(TmpReg), m_ICst(IntImm)))) {
+    Inserts.push_back(std::make_pair(TmpReg, IntImm));
+    CurrInst = TmpInst;
+  }
+
+  // Make sure we have the correct number of elements.
+  if (Inserts.size() != NumElts)
+    return false;
+  SmallVector<bool, 4> SeenIndex(NumElts, false);
+  std::sort(Inserts.begin(), Inserts.end(),
+            [](const std::pair<Register, int64_t> &P1,
+               const std::pair<Register, int64_t> &P2) {
+              return P1.second < P2.second;
+            });
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (Inserts[i].second != i)
+      return false;
+  }
+  for (auto &P : Inserts)
+    MatchInfo.push_back(P.first);
+  return true;
+}
+
+bool CombinerHelper::applyBuildVecFromRegs(
+    MachineInstr &MI, SmallVector<Register, 4> &MatchInfo) {
+  Builder.setInstr(MI);
+  Builder.buildBuildVector(MI.getOperand(0).getReg(), MatchInfo);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool CombinerHelper::matchInsertVecEltBuildVec(
+    MachineInstr &MI, SmallVector<Register, 4> &MatchInfo) {
+  Register SrcVecReg = MI.getOperand(1).getReg();
+  MachineInstr *SrcMI = MRI.getVRegDef(SrcVecReg);
+  Register EltReg = MI.getOperand(2).getReg();
+  Register IdxReg = MI.getOperand(3).getReg();
+  auto IdxCst = getConstantVRegVal(IdxReg, MRI);
+  if (!IdxCst)
+    return false;
+  if (SrcMI->getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
+    for (auto &Op : SrcMI->uses())
+      MatchInfo.push_back(Op.getReg());
+    MatchInfo[*IdxCst] = EltReg;
+    return true;
+  }
+  return false;
+}
+
 bool CombinerHelper::applySimplifyAddToSub(
     MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) {
   Builder.setInstr(MI);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -march=aarch64 -run-pass=aarch64-prelegalizer-combiner %s | FileCheck %s
+---
+name: test_combine_consecutive
+body: |
+  bb.1:
+    liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: test_combine_consecutive
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK: $x0 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(<2 x s32>) = G_IMPLICIT_DEF
+    %7:_(s32) = G_CONSTANT i32 0
+    %8:_(s32) = G_CONSTANT i32 1
+    %3:_(<2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %7(s32)
+    %4:_(<2 x s32>) = G_INSERT_VECTOR_ELT %3, %1(s32), %8(s32)
+    $x0 = COPY %4
+...
+---
+name: test_combine_insert_vec_build_vec_idx_1
+body: |
+  bb.1:
+    liveins: $w0, $w1, $w2, $w3
+
+    ; CHECK-LABEL: name: test_combine_insert_vec_build_vec_idx_1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w3
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %6:_(s32) = COPY $w2
+    %7:_(s32) = COPY $w3
+    %2:_(<4 x s32>) = G_BUILD_VECTOR %0, %1, %6, %7
+    %3:_(s32) = G_CONSTANT i32 1
+    %4:_(<4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+    $q0 = COPY %4
+...
diff --git a/llvm/test/CodeGen/AArch64/combine-loads.ll b/llvm/test/CodeGen/AArch64/combine-loads.ll
--- a/llvm/test/CodeGen/AArch64/combine-loads.ll
+++ b/llvm/test/CodeGen/AArch64/combine-loads.ll
@@ -4,11 +4,10 @@
 define <2 x i64> @z(i64* nocapture nonnull readonly %p) {
 ; CHECK-LABEL: z:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    ldr x9, [x0]
 ; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    mov v0.d[0], x9
+; CHECK-NEXT:    // implicit-def: $q0
+; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    mov v0.d[1], x8
 ; CHECK-NEXT:    ret
   %b = load i64, i64* %p
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -3385,34 +3385,34 @@
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_mov_b32 s6, 0xffffff
-; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s6, v4
-; CGP-NEXT:    v_and_b32_e32 v4, s6, v6
+; CGP-NEXT:    v_and_b32_e32 v1, s6, v2
+; CGP-NEXT:    v_and_b32_e32 v2, s6, v4
+; CGP-NEXT:    v_and_b32_e32 v3, s6, v6
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; CGP-NEXT:    v_rcp_f32_e32 v4, v2
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v3
-; CGP-NEXT:    v_rcp_f32_e32 v6, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, v0, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, v2, v6
+; CGP-NEXT:    v_mul_f32_e32 v4, v0, v4
+; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
+; CGP-NEXT:    v_trunc_f32_e32 v4, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v5, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mad_f32 v0, -v5, v3, v0
+; CGP-NEXT:    v_mad_f32 v0, -v4, v2, v0
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_mad_f32 v1, -v5, v3, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mad_f32 v2, -v6, v4, v2
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v3
+; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v2
-; CGP-NEXT:    v_mov_b32_e32 v3, v1
+; CGP-NEXT:    v_and_b32_e32 v2, s6, v1
+; CGP-NEXT:    v_mov_b32_e32 v1, 0
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -3336,38 +3336,38 @@
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_mov_b32 s6, 0xffffff
-; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v2
-; CGP-NEXT:    v_and_b32_e32 v3, s6, v4
-; CGP-NEXT:    v_and_b32_e32 v4, s6, v6
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; CGP-NEXT:    v_rcp_f32_e32 v9, v6
-; CGP-NEXT:    v_rcp_f32_e32 v10, v8
-; CGP-NEXT:    v_mul_f32_e32 v9, v5, v9
-; CGP-NEXT:    v_mul_f32_e32 v10, v7, v10
+; CGP-NEXT:    v_and_b32_e32 v1, s6, v2
+; CGP-NEXT:    v_and_b32_e32 v2, s6, v4
+; CGP-NEXT:    v_and_b32_e32 v3, s6, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT:    v_rcp_f32_e32 v8, v5
+; CGP-NEXT:    v_rcp_f32_e32 v9, v7
+; CGP-NEXT:    v_mul_f32_e32 v8, v4, v8
+; CGP-NEXT:    v_mul_f32_e32 v9, v6, v9
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_trunc_f32_e32 v9, v9
-; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mad_f32 v5, -v9, v6, v5
+; CGP-NEXT:    v_mad_f32 v4, -v8, v5, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_mad_f32 v6, -v9, v7, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; CGP-NEXT:    v_mad_f32 v7, -v10, v8, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, v6
+; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v7|, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v5, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v6, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; CGP-NEXT:    v_and_b32_e32 v0, s6, v0
-; CGP-NEXT:    v_and_b32_e32 v2, s6, v2
-; CGP-NEXT:    v_mov_b32_e32 v3, v1
+; CGP-NEXT:    v_and_b32_e32 v2, s6, v1
+; CGP-NEXT:    v_mov_b32_e32 v1, 0
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -154,15 +154,15 @@
 define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) {
 ; GCN-LABEL: scalar_xnor_i64_mul_use:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
-; GCN-NEXT:    s_not_b64 s[0:1], s[2:3]
-; GCN-NEXT:    s_add_u32 s2, s2, s4
-; GCN-NEXT:    s_cselect_b32 s4, 1, 0
-; GCN-NEXT:    s_and_b32 s4, s4, 1
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_addc_u32 s3, s3, s5
+; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GCN-NEXT:    s_not_b64 s[4:5], s[2:3]
+; GCN-NEXT:    s_add_u32 s2, s2, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_addc_u32 s3, s3, s1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    ; return to shader part epilog
   %xor = xor i64 %a, %b
   %r0.val = xor i64 %xor, -1