diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -355,6 +355,10 @@ /// \return true if \p MI is a G_SEXT_INREG that can be erased. bool matchRedundantSExtInReg(MachineInstr &MI); + /// Fold a not (xor with true) of a compare into its inverse cond code. + bool matchNotCmp(MachineInstr &MI, Register &CmpReg); + bool applyNotCmp(MachineInstr &MI, Register &CmpReg); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -233,6 +233,12 @@ return BinaryOp_match(L, R); } +template +inline BinaryOp_match +m_GXor(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + template inline BinaryOp_match m_GOr(const LHS &L, const RHS &R) { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MachineValueType.h" @@ -227,6 +228,10 @@ /// If \p MI is not a splat, returns None. Optional getSplatIndex(MachineInstr &MI); +/// Returns a scalar constant of a G_BUILD_VECTOR splat if it exists.
+Optional getBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI); + /// Return true if the specified instruction is a G_BUILD_VECTOR or /// G_BUILD_VECTOR_TRUNC where all of the elements are 0 or undef. bool isBuildVectorAllZeros(const MachineInstr &MI, @@ -237,5 +242,9 @@ bool isBuildVectorAllOnes(const MachineInstr &MI, const MachineRegisterInfo &MRI); +/// Returns true if given the TargetLowering's boolean contents information, +/// the value \p Val contains a true value. +bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector, + bool IsFP); } // End namespace llvm. #endif diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -373,6 +373,14 @@ (apply [{ return Helper.applyCombineExtOfExt(*${root}, ${matchinfo}); }]) >; +def not_cmp_fold_matchinfo : GIDefMatchData<"Register">; +def not_cmp_fold : GICombineRule< + (defs root:$d, not_cmp_fold_matchinfo:$info), + (match (wip_match_opcode G_XOR): $d, + [{ return Helper.matchNotCmp(*${d}, ${info}); }]), + (apply [{ return Helper.applyNotCmp(*${d}, ${info}); }]) +>; + // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -400,4 +408,5 @@ hoist_logic_op_with_same_opcode_hands, shl_ashr_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, - known_bits_simplifications, ext_ext_fold]>; + known_bits_simplifications, ext_ext_fold, + not_cmp_fold]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2231,6 +2231,74 @@ return KB->computeNumSignBits(Src) >= (TypeSize - ExtBits + 1); } +static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits, + int64_t Cst, bool IsVector, bool IsFP) { + // For i1, Cst will always be -1 regardless of boolean contents. + return (ScalarSizeBits == 1 && Cst == -1) || + isConstTrueVal(TLI, Cst, IsVector, IsFP); +} + +bool CombinerHelper::matchNotCmp(MachineInstr &MI, Register &CmpReg) { + assert(MI.getOpcode() == TargetOpcode::G_XOR); + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + const auto &TLI = *Builder.getMF().getSubtarget().getTargetLowering(); + Register XorSrc; + Register CstReg; + int64_t Cst; + // We match xor(src, true) here. + if (!mi_match(MI.getOperand(0).getReg(), MRI, + m_GXor(m_Reg(XorSrc), m_Reg(CstReg)))) + return false; + + if (!MRI.hasOneNonDBGUse(XorSrc)) + return false; + + // Now try match src to either icmp or fcmp. + bool IsFP = false; + if (!mi_match(XorSrc, MRI, m_GICmp(m_Pred(), m_Reg(), m_Reg()))) { + // Try fcmp. 
+ if (!mi_match(XorSrc, MRI, m_GFCmp(m_Pred(), m_Reg(), m_Reg()))) + return false; + IsFP = true; + } + + if (Ty.isVector()) { + MachineInstr *CstDef = MRI.getVRegDef(CstReg); + auto MaybeCst = getBuildVectorConstantSplat(*CstDef, MRI); + if (!MaybeCst) + return false; + if (!isConstValidTrue(TLI, Ty.getScalarSizeInBits(), *MaybeCst, true, IsFP)) + return false; + } else { + if (!mi_match(CstReg, MRI, m_ICst(Cst))) + return false; + if (!isConstValidTrue(TLI, Ty.getSizeInBits(), Cst, false, IsFP)) + return false; + } + + CmpReg = XorSrc; + return true; +} + +bool CombinerHelper::applyNotCmp(MachineInstr &MI, Register &CmpReg) { + MachineInstr *CmpDef = MRI.getVRegDef(CmpReg); + assert(CmpDef && "Should have been given an MI reg"); + assert(CmpDef->getOpcode() == TargetOpcode::G_ICMP || + CmpDef->getOpcode() == TargetOpcode::G_FCMP); + + Observer.changingInstr(*CmpDef); + MachineOperand &PredOp = CmpDef->getOperand(1); + CmpInst::Predicate NewP = CmpInst::getInversePredicate( + (CmpInst::Predicate)PredOp.getPredicate()); + PredOp.setPredicate(NewP); + Observer.changedInstr(*CmpDef); + + replaceRegWith(MRI, MI.getOperand(0).getReg(), + CmpDef->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,6 +11,8 @@ #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -694,6 +696,28 @@ return true; } +Optional +llvm::getBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + if (!isBuildVectorOp(MI.getOpcode())) + return None; + + const 
unsigned NumOps = MI.getNumOperands(); + Optional Scalar; + for (unsigned I = 1; I != NumOps; ++I) { + Register Element = MI.getOperand(I).getReg(); + int64_t ElementValue; + if (!mi_match(Element, MRI, m_ICst(ElementValue))) + return None; + if (!Scalar) + Scalar = ElementValue; + else if (*Scalar != ElementValue) + return None; + } + + return Scalar; +} + bool llvm::isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI) { return isBuildVectorConstantSplat(MI, MRI, 0); @@ -703,3 +727,16 @@ const MachineRegisterInfo &MRI) { return isBuildVectorConstantSplat(MI, MRI, -1); } + +bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector, + bool IsFP) { + switch (TLI.getBooleanContents(IsVector, IsFP)) { + case TargetLowering::UndefinedBooleanContent: + return Val & 0x1; + case TargetLowering::ZeroOrOneBooleanContent: + return Val == 1; + case TargetLowering::ZeroOrNegativeOneBooleanContent: + return Val == -1; + } + llvm_unreachable("Invalid boolean contents"); +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-invert-cmp.mir @@ -0,0 +1,163 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64-apple-ios -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="not_cmp_fold" %s -o - -verify-machineinstrs | FileCheck %s + +# Check that we fold an inverted compare result into just inverting the condition code.
+--- +name: icmp +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: icmp + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY]](s64), [[C]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s1) = G_CONSTANT i1 1 + %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 + %4:_(s1) = G_XOR %3, %2 + %5:_(s32) = G_ANYEXT %4 + $w0 = COPY %5(s32) + RET_ReallyLR implicit $w0 +... +--- +name: fcmp +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: fcmp + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ule), [[COPY]](s64), [[C]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCMP]](s1) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s1) = G_CONSTANT i1 1 + %3:_(s1) = G_FCMP floatpred(ogt), %0(s64), %1 + %4:_(s1) = G_XOR %3, %2 + %5:_(s32) = G_ANYEXT %4 + $w0 = COPY %5(s32) + RET_ReallyLR implicit $w0 +... 
+--- +name: icmp_not_xor_with_1 +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: icmp_not_xor_with_1 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] + ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C1]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s1) = G_CONSTANT i1 0 + %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 + %4:_(s1) = G_XOR %3, %2 + %5:_(s32) = G_ANYEXT %4 + $w0 = COPY %5(s32) + RET_ReallyLR implicit $w0 +... +--- +name: icmp_not_xor_with_wrong_bool_contents +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0 + + ; Even though bit 0 of the constant is 1, we require zero in the upper bits + ; for our aarch64's zero-or-one boolean contents. + ; CHECK-LABEL: name: icmp_not_xor_with_wrong_bool_contents + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] + ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP]], [[C1]] + ; CHECK: $w0 = COPY [[XOR]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s32) = G_CONSTANT i32 7 + %3:_(s32) = G_ICMP intpred(sgt), %0(s64), %1 + %4:_(s32) = G_XOR %3, %2 + $w0 = COPY %4(s32) + RET_ReallyLR implicit $w0 +... 
+--- +name: icmp_multiple_use +tracksRegLiveness: true +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: icmp_multiple_use + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]] + ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C1]] + ; CHECK: %other_use:_(s1) = G_AND [[ICMP]], [[C1]] + ; CHECK: %other_use_ext:_(s32) = G_ANYEXT %other_use(s1) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: $w1 = COPY %other_use_ext(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(s1) = G_CONSTANT i1 1 + %3:_(s1) = G_ICMP intpred(sgt), %0(s64), %1 + %4:_(s1) = G_XOR %3, %2 + %other_use:_(s1) = G_AND %3, %2 + %other_use_ext:_(s32) = G_ANYEXT %other_use(s1) + %5:_(s32) = G_ANYEXT %4 + $w0 = COPY %5(s32) + $w1 = COPY %other_use_ext + RET_ReallyLR implicit $w0 +... 
+--- +name: icmp_vector +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: icmp_vector + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: %splat_op2:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[ICMP:%[0-9]+]]:_(<4 x s1>) = G_ICMP intpred(sle), [[COPY]](<4 x s32>), %splat_op2 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[ICMP]](<4 x s1>) + ; CHECK: $q0 = COPY [[ANYEXT]](<4 x s32>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 5 + %splat_op2:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1 + %2:_(s1) = G_CONSTANT i1 1 + %splat_true:_(<4 x s1>) = G_BUILD_VECTOR %2, %2, %2, %2 + %3:_(<4 x s1>) = G_ICMP intpred(sgt), %0(<4 x s32>), %splat_op2 + %4:_(<4 x s1>) = G_XOR %3, %splat_true + %5:_(<4 x s32>) = G_ANYEXT %4 + $q0 = COPY %5(<4 x s32>) + RET_ReallyLR implicit $q0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -136,27 +136,24 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_xor_b32 s4, s4, -1 ; CHECK-NEXT: s_and_b32 s4, s4, 1 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cbranch_scc0 BB4_6 +; CHECK-NEXT: s_cbranch_scc1 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, 
const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.2: ; %bb7 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ; %bb.3: ; %bb8 @@ -217,10 +214,8 @@ ; CHECK-NEXT: ; %bb.2: ; %bb4 ; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, v0, v2 -; CHECK-NEXT: s_xor_b64 s[2:3], vcc, s[2:3] +; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2 ; CHECK-NEXT: BB5_3: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -10,12 +10,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: 
s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 @@ -204,10 +202,7 @@ ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -358,11 +353,14 @@ ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: BB1_2: ; %Flow +; CHECK-NEXT: s_branch BB1_3 +; CHECK-NEXT: BB1_2: +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: BB1_3: ; %Flow ; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4 ; CHECK-NEXT: s_sub_i32 s0, 0, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -382,7 +380,7 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: BB1_4: +; CHECK-NEXT: BB1_5: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -695,12 +693,10 @@ ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 
s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -874,12 +870,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 @@ -2513,12 +2507,10 @@ ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 @@ -3002,13 +2994,11 @@ ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 @@ -3182,12 +3172,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -10,12 +10,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 @@ -200,10 +198,7 @@ ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, 
s[6:7], 0 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s0, s5, 31 @@ -352,11 +347,14 @@ ; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: BB1_2: ; %Flow +; CHECK-NEXT: s_branch BB1_3 +; CHECK-NEXT: BB1_2: +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: BB1_3: ; %Flow ; CHECK-NEXT: s_and_b32 s0, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4 ; CHECK-NEXT: s_sub_i32 s0, 0, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -374,7 +372,7 @@ ; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: BB1_4: +; CHECK-NEXT: BB1_5: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -683,12 +681,10 @@ ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 @@ -858,12 +854,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: 
s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 @@ -2477,12 +2471,10 @@ ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 @@ -2958,13 +2950,11 @@ ; CGP-NEXT: v_mov_b32_e32 v5, v0 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 @@ -3134,12 +3124,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 
s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -10,12 +10,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 @@ -189,10 +187,7 @@ ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -324,11 +319,14 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: BB1_2: ; %Flow +; CHECK-NEXT: s_branch BB1_3 +; CHECK-NEXT: BB1_2: +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: BB1_3: ; %Flow ; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_cbranch_scc0 BB1_5 
+; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -348,7 +346,7 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: BB1_4: +; CHECK-NEXT: BB1_5: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -631,12 +629,10 @@ ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -795,12 +791,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 @@ -2292,15 +2286,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v7, v1, v5 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; 
CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 @@ -2736,16 +2728,14 @@ ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 @@ -2904,12 +2894,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -10,12 +10,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 @@ -186,10 +184,7 @@ ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -320,11 +315,14 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: BB1_2: ; %Flow +; CHECK-NEXT: s_branch BB1_3 +; CHECK-NEXT: BB1_2: +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: BB1_3: ; %Flow ; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: s_cbranch_scc0 BB1_4 -; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_cbranch_scc0 BB1_5 +; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 ; CHECK-NEXT: s_sub_i32 s1, 0, s2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -342,7 +340,7 @@ ; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s2, v0 ; 
CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: BB1_4: +; CHECK-NEXT: BB1_5: ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: ; return to shader part epilog @@ -623,12 +621,10 @@ ; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_or_b32_e32 v1, v9, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 @@ -784,12 +780,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v7 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 @@ -2258,15 +2252,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v7, v1, v5 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CHECK-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 @@ -2697,16 +2689,14 @@ ; CGP-NEXT: v_mov_b32_e32 v7, v1 ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 ; CGP-NEXT: v_or_b32_e32 v1, v7, v11 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 @@ -2862,12 +2852,10 @@ ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: v_or_b32_e32 v5, v3, v9 ; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 -; CGP-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CGP-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8