diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -147,9 +147,10 @@
   bool matchSextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
   bool applySextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
 
-  bool matchElideBrByInvertingCond(MachineInstr &MI);
-  void applyElideBrByInvertingCond(MachineInstr &MI);
-  bool tryElideBrByInvertingCond(MachineInstr &MI);
+  /// If a brcond's true block is not the fallthrough, make it so by inverting
+  /// the condition and swapping operands.
+  bool matchOptBrCondByInvertingCond(MachineInstr &MI);
+  void applyOptBrCondByInvertingCond(MachineInstr &MI);
 
   /// If \p MI is G_CONCAT_VECTORS, try to combine it.
   /// Returns true if MI changed.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -241,5 +241,9 @@
 /// Returns true if given the TargetLowering's boolean contents information,
 /// the value \p Val contains a true value.
 bool isConstTrueVal(const TargetLowering &TLI, int64_t Val);
+
+/// Returns an integer representing true, as defined by the
+/// TargetBooleanContents, for a given scalar type \p Ty.
+APInt getICmpTrueVal(const TargetLowering &TLI, LLT Ty);
 } // End namespace llvm.
 #endif
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -145,13 +145,11 @@
          [{ return Helper.matchCombineIndexedLoadStore(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineIndexedLoadStore(*${root}, ${matchinfo}); }])>;
 
-// FIXME: Is there a reason this wasn't in tryCombine? I've left it out of
-// all_combines because it wasn't there.
-def elide_br_by_inverting_cond : GICombineRule<
+def opt_brcond_by_inverting_cond : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_BR):$root,
-         [{ return Helper.matchElideBrByInvertingCond(*${root}); }]),
-  (apply [{ Helper.applyElideBrByInvertingCond(*${root}); }])>;
+         [{ return Helper.matchOptBrCondByInvertingCond(*${root}); }]),
+  (apply [{ Helper.applyOptBrCondByInvertingCond(*${root}); }])>;
 
 def ptr_add_immed_matchdata : GIDefMatchData<"PtrAddChain">;
 def ptr_add_immed_chain : GICombineRule<
@@ -355,4 +353,6 @@
     identity_combines, simplify_add_to_sub,
     hoist_logic_op_with_same_opcode_hands,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
-    width_reduction_combines, not_cmp_fold]>;
+    width_reduction_combines, not_cmp_fold,
+    opt_brcond_by_inverting_cond]>;
+
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -876,14 +876,12 @@
   LLVM_DEBUG(dbgs() << "    Combinined to indexed operation");
 }
 
-bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
+bool CombinerHelper::matchOptBrCondByInvertingCond(MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::G_BR)
     return false;
 
   // Try to match the following:
   // bb1:
-  //   %c(s32) = G_ICMP pred, %a, %b
-  //   %c1(s1) = G_TRUNC %c(s32)
   //   G_BRCOND %c1, %bb2
   //   G_BR %bb3
   // bb2:
@@ -893,7 +891,7 @@
   // The above pattern does not have a fall through to the successor bb2, always
   // resulting in a branch no matter which path is taken. Here we try to find
   // and replace that pattern with conditional branch to bb3 and otherwise
-  // fallthrough to bb2.
+  // fallthrough to bb2. This is generally better for branch predictors.
 
   MachineBasicBlock *MBB = MI.getParent();
   MachineBasicBlock::iterator BrIt(MI);
@@ -908,40 +906,32 @@
   // Check that the next block is the conditional branch target.
   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
     return false;
-
-  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
-  if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
-      !MRI.hasOneNonDBGUse(CmpMI->getOperand(0).getReg()))
-    return false;
-  return true;
-}
-
-bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
-  if (!matchElideBrByInvertingCond(MI))
-    return false;
-  applyElideBrByInvertingCond(MI);
   return true;
 }
 
-void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
+void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI) {
   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
   MachineBasicBlock::iterator BrIt(MI);
   MachineInstr *BrCond = &*std::prev(BrIt);
-  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
+  auto &MF = Builder.getMF();
+  const auto &TLI = *MF.getSubtarget().getTargetLowering();
 
-  CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
-      (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
+  Builder.setInstrAndDebugLoc(*BrCond);
+  LLT Ty = MRI.getType(BrCond->getOperand(0).getReg());
+  auto True = Builder.buildConstant(Ty, getICmpTrueVal(TLI, Ty));
+  auto Xor = Builder.buildXor(Ty, BrCond->getOperand(0), True);
 
-  // Invert the G_ICMP condition.
-  Observer.changingInstr(*CmpMI);
-  CmpMI->getOperand(1).setPredicate(InversePred);
-  Observer.changedInstr(*CmpMI);
+  auto *FallthroughBB = BrCond->getOperand(1).getMBB();
+  Observer.changingInstr(MI);
+  MI.getOperand(0).setMBB(FallthroughBB);
+  Observer.changedInstr(MI);
 
-  // Change the conditional branch target.
+  // Change the conditional branch to use the inverted condition and
+  // new target block.
   Observer.changingInstr(*BrCond);
+  BrCond->getOperand(0).setReg(Xor.getReg(0));
   BrCond->getOperand(1).setMBB(BrTarget);
   Observer.changedInstr(*BrCond);
-  MI.eraseFromParent();
 }
 
 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -11,6 +11,7 @@
 
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -715,3 +716,18 @@
   }
   llvm_unreachable("Invalid boolean contents");
 }
+
+APInt llvm::getICmpTrueVal(const TargetLowering &TLI, LLT Ty) {
+  assert(Ty.isScalar() && "Expected a scalar type only");
+  APInt True(Ty.getSizeInBits(), 0);
+  switch (TLI.getBooleanContents(false, false)) {
+  case TargetLowering::UndefinedBooleanContent:
+  case TargetLowering::ZeroOrOneBooleanContent:
+    True.setBit(0);
+    return True;
+  case TargetLowering::ZeroOrNegativeOneBooleanContent:
+    True.setAllBits();
+    return True;
+  }
+  llvm_unreachable("Invalid boolean contents");
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -19,7 +19,6 @@
 
 def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
   "AArch64GenPreLegalizerCombinerHelper", [all_combines,
-                                           elide_br_by_inverting_cond,
                                            fconstant_to_constant]> {
   let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
   let StateClass = "AArch64PreLegalizerCombinerHelperState";
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -42,8 +42,7 @@
 
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
-  elide_br_by_inverting_cond]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll b/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/const-0.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -global-isel -O0 -o - %s | FileCheck %s
-
-%struct.comp = type { i8*, i32, i8*, [3 x i8], i32 }
-
-define void @regbranch() {
-; CHECK-LABEL: regbranch:
-; CHECK: mov {{w[0-9]+}}, #0
-cond_next240.i:
-  br i1 false, label %cond_true251.i, label %cond_next272.i
-
-cond_true251.i:
-  switch i8 0, label %cond_next272.i [
-    i8 42, label %bb268.i
-    i8 43, label %bb268.i
-    i8 63, label %bb268.i
-  ]
-
-bb268.i:
-  br label %cond_next272.i
-
-cond_next272.i:
-  %len.2.i = phi i32 [ 0, %bb268.i ], [ 0, %cond_next240.i ], [ 0, %cond_true251.i ]
-  %tmp278.i = icmp eq i32 %len.2.i, 1
-  ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="opt_brcond_by_inverting_cond" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
   target triple = "arm64-apple-ios5.0.0"
@@ -38,8 +38,11 @@
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY]](s32), [[C]]
-    ; CHECK: G_BRCOND [[ICMP]](s1), %bb.2
+    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+    ; CHECK: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
+    ; CHECK: G_BRCOND [[XOR]](s1), %bb.2
+    ; CHECK: G_BR %bb.1
     ; CHECK: bb.1.if.then:
     ; CHECK: successors: %bb.3(0x80000000)
     ; CHECK: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[COPY1]], [[COPY]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-constant.mir
@@ -8,6 +8,8 @@
   define i16 @const_s16() { ret i16 42 }
   define i32 @const_s32() { ret i32 42 }
   define i64 @const_s64() { ret i64 1234567890123 }
+  define i32 @const_s32_zero() { ret i32 0 }
+  define i64 @const_s64_zero() { ret i64 0 }
   define i8* @const_p0_0() { ret i8* null }
 
   define i32 @fconst_s32() { ret i32 42 }
@@ -81,6 +83,38 @@
     $x0 = COPY %0(s64)
 ...
 
+---
+name:            const_s32_zero
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: const_s32_zero
+    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $wzr
+    ; CHECK: $w0 = COPY [[COPY]]
+    %0(s32) = G_CONSTANT i32 0
+    $w0 = COPY %0(s32)
+...
+
+---
+name:            const_s64_zero
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: const_s64_zero
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $xzr
+    ; CHECK: $x0 = COPY [[COPY]]
+    %0(s64) = G_CONSTANT i64 0
+    $x0 = COPY %0(s64)
+...
+
 ---
 name:            const_p0_0
 legalized:       true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -134,11 +134,11 @@
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s4, 0
 ; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
 ; CHECK-NEXT:    s_and_b32 s4, s4, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
-; CHECK-NEXT:    s_cbranch_scc1 BB4_6
+; CHECK-NEXT:    s_cbranch_scc0 BB4_6
 ; CHECK-NEXT:    ; %bb.1: ; %bb2
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
@@ -208,9 +208,9 @@
 ; CHECK-NEXT:  BB5_1: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cmp_le_i32_e32 vcc, 0, v1
+; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, 1
-; CHECK-NEXT:    s_cbranch_vccnz BB5_3
+; CHECK-NEXT:    s_cbranch_vccz BB5_3
 ; CHECK-NEXT:    ; %bb.2: ; %bb4
 ; CHECK-NEXT:    ; in Loop: Header=BB5_1 Depth=1
 ; CHECK-NEXT:    global_load_dword v2, v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -9,11 +9,11 @@
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x24
 ; GCN-NEXT:    ; implicit-def: $vcc_hi
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s1, 0
 ; GCN-NEXT:    s_cselect_b32 s1, 1, 0
 ; GCN-NEXT:    s_and_b32 s1, s1, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s1, 0
-; GCN-NEXT:    s_cbranch_scc1 BB0_2
+; GCN-NEXT:    s_cbranch_scc0 BB0_2
 ; GCN-NEXT:    ; %bb.1: ; %mid
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -7,11 +7,11 @@
 ; GCN-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cselect_b32 s2, 1, 0
 ; GCN-NEXT:    s_and_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cbranch_scc1 BB0_2
+; GCN-NEXT:    s_cbranch_scc0 BB0_2
 ; GCN-NEXT:    ; %bb.1: ; %mid
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -20,20 +20,20 @@
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s33, 0
-; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:    s_cbranch_scc0 BB0_3
 ; GCN-NEXT:    ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:    s_cbranch_scc0 BB0_3
 ; GCN-NEXT:    ; %bb.2: ; %bb.1
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0x10
@@ -102,12 +102,12 @@
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
 ; GCN-NEXT:    s_cselect_b32 s6, 1, 0
 ; GCN-NEXT:    s_and_b32 s6, s6, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s33, 0
-; GCN-NEXT:    s_cbranch_scc1 BB1_2
+; GCN-NEXT:    s_cbranch_scc0 BB1_2
 ; GCN-NEXT:    ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0xc
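
Illustrative appendix (not part of the patch): applyOptBrCondByInvertingCond inverts the branch condition by XOR-ing it with the target's "true" constant instead of flipping an ICMP predicate, so it no longer requires the condition to come from a single-use G_ICMP. The standalone C++ sketch below mirrors the boolean-contents switch in getICmpTrueVal using plain integers rather than APInt/TargetLowering; the BooleanContent enum and icmpTrueVal name are hypothetical stand-ins, not LLVM APIs.

// Standalone sketch: shows which constant the combine XORs the condition with.
#include <cassert>
#include <cstdint>
#include <iostream>

enum class BooleanContent { Undefined, ZeroOrOne, ZeroOrNegativeOne };

// Returns the value representing "true" for a BitWidth-bit integer under the
// given boolean-contents convention; XOR-ing a condition with this value
// inverts it, which is what the new apply routine relies on.
uint64_t icmpTrueVal(BooleanContent BC, unsigned BitWidth) {
  assert(BitWidth >= 1 && BitWidth <= 64 && "sketch only handles up to 64 bits");
  switch (BC) {
  case BooleanContent::Undefined:
  case BooleanContent::ZeroOrOne:
    return 1; // only bit 0 set, matching True.setBit(0)
  case BooleanContent::ZeroOrNegativeOne:
    return BitWidth == 64 ? ~0ULL : ((1ULL << BitWidth) - 1); // all bits set
  }
  return 0; // unreachable for a valid BooleanContent
}

int main() {
  // An s1 condition under ZeroOrOne contents is inverted by XOR with 1.
  std::cout << icmpTrueVal(BooleanContent::ZeroOrOne, 1) << '\n';          // 1
  std::cout << icmpTrueVal(BooleanContent::ZeroOrNegativeOne, 16) << '\n'; // 65535
}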