diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -204,9 +204,6 @@ void initializeSILowerControlFlowPass(PassRegistry &); extern char &SILowerControlFlowID; -void initializeSIRemoveShortExecBranchesPass(PassRegistry &); -extern char &SIRemoveShortExecBranchesID; - void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -249,7 +249,6 @@ initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); - initializeSIRemoveShortExecBranchesPass(*PR); initializeSIPreEmitPeepholePass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); @@ -1215,7 +1214,6 @@ if (getOptLevel() > CodeGenOpt::None) addPass(&SIInsertHardClausesID); - addPass(&SIRemoveShortExecBranchesID); addPass(&SIInsertSkipsPassID); addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -137,7 +137,6 @@ SIPreEmitPeephole.cpp SIProgramInfo.cpp SIRegisterInfo.cpp - SIRemoveShortExecBranches.cpp SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -21,6 +21,14 @@ #define DEBUG_TYPE "si-pre-emit-peephole" +static unsigned SkipThreshold; + +static cl::opt<unsigned, true> SkipThresholdFlag( + "amdgpu-skip-threshold", cl::Hidden, + cl::desc( + "Number of
instructions before jumping over divergent control flow"), + cl::location(SkipThreshold), cl::init(12)); + namespace { class SIPreEmitPeephole : public MachineFunctionPass { @@ -30,6 +38,13 @@ bool optimizeVccBranch(MachineInstr &MI) const; bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; + bool getBlockDestinations(MachineBasicBlock &SrcMBB, + MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, + SmallVectorImpl<MachineOperand> &Cond); + bool mustRetainExeczBranch(const MachineBasicBlock &From, + const MachineBasicBlock &To) const; + bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); public: static char ID; @@ -258,6 +273,74 @@ return true; } +bool SIPreEmitPeephole::getBlockDestinations( + MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) { + if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + if (!FalseMBB) + FalseMBB = SrcMBB.getNextNode(); + + return true; +} + +bool SIPreEmitPeephole::mustRetainExeczBranch( + const MachineBasicBlock &From, const MachineBasicBlock &To) const { + unsigned NumInstr = 0; + const MachineFunction *MF = From.getParent(); + + for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might never be taken when EXEC = 0. + // Hence we should retain cbranch out of the loop lest it become infinite. + if (I->isConditionalBranch()) + return true; + + if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) + return true; + + // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || + TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) + return true; + + ++NumInstr; + if (NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +// Returns true if the skip branch instruction is removed. +bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, + MachineBasicBlock &SrcMBB) { + MachineBasicBlock *TrueMBB = nullptr; + MachineBasicBlock *FalseMBB = nullptr; + SmallVector<MachineOperand, 1> Cond; + + if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + // Consider only the forward branches. + if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || + mustRetainExeczBranch(*FalseMBB, *TrueMBB)) + return false; + + LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); + MI.eraseFromParent(); + SrcMBB.removeSuccessor(TrueMBB); + + return true; +} + bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -265,10 +348,12 @@ MachineBasicBlock *EmptyMBBAtEnd = nullptr; bool Changed = false; + MF.RenumberBlocks(); + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); MachineBasicBlock::iterator TermI = MBBE; - // Check first terminator for VCC branches to optimize + // Check first terminator for branches to optimize if (TermI != MBB.end()) { MachineInstr &MI = *TermI; switch (MI.getOpcode()) { @@ -276,6 +361,9 @@ case AMDGPU::S_CBRANCH_VCCNZ: Changed |= optimizeVccBranch(MI); continue; + case AMDGPU::S_CBRANCH_EXECZ: + Changed |= removeExeczBranch(MI, MBB); + continue; default: break; } diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp deleted file mode 100644 --- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp +++ /dev/null @@ -1,159 +0,0 @@ -//===-- SIRemoveShortExecBranches.cpp ------------------------------------===// -// -// Part of the LLVM Project, under
the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass optmizes the s_cbranch_execz instructions. -/// The pass removes this skip instruction for short branches, -/// if there is no unwanted sideeffect in the fallthrough code sequence. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-remove-short-exec-branches" - -static unsigned SkipThreshold; - -static cl::opt<unsigned, true> SkipThresholdFlag( - "amdgpu-skip-threshold", cl::Hidden, - cl::desc( - "Number of instructions before jumping over divergent control flow"), - cl::location(SkipThreshold), cl::init(12)); - -namespace { - -class SIRemoveShortExecBranches : public MachineFunctionPass { -private: - const SIInstrInfo *TII = nullptr; - bool getBlockDestinations(MachineBasicBlock &SrcMBB, - MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, - SmallVectorImpl<MachineOperand> &Cond); - bool mustRetainExeczBranch(const MachineBasicBlock &From, - const MachineBasicBlock &To) const; - bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - -public: - static char ID; - - SIRemoveShortExecBranches() : MachineFunctionPass(ID) { - initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; - -} // End anonymous namespace.
- -INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE, - "SI remove short exec branches", false, false) - -char SIRemoveShortExecBranches::ID = 0; - -char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID; - -bool SIRemoveShortExecBranches::getBlockDestinations( - MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) { - if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - if (!FalseMBB) - FalseMBB = SrcMBB.getNextNode(); - - return true; -} - -bool SIRemoveShortExecBranches::mustRetainExeczBranch( - const MachineBasicBlock &From, const MachineBasicBlock &To) const { - unsigned NumInstr = 0; - const MachineFunction *MF = From.getParent(); - - for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might never be taken when EXEC = 0. - // Hence we should retain cbranch out of the loop lest it become infinite. - if (I->isConditionalBranch()) - return true; - - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) - return true; - - if (TII->isKillTerminator(I->getOpcode())) - return true; - - // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) - return true; - - ++NumInstr; - if (NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI, - MachineBasicBlock &SrcMBB) { - MachineBasicBlock *TrueMBB = nullptr; - MachineBasicBlock *FalseMBB = nullptr; - SmallVector<MachineOperand, 1> Cond; - - if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - // Consider only the forward branches. - if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || - mustRetainExeczBranch(*FalseMBB, *TrueMBB)) - return false; - - LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); - MI.eraseFromParent(); - SrcMBB.removeSuccessor(TrueMBB); - - return true; -} - -bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - TII = ST.getInstrInfo(); - MF.RenumberBlocks(); - bool Changed = false; - - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - if (MBBI == MBB.end()) - continue; - - MachineInstr &MI = *MBBI; - switch (MI.getOpcode()) { - case AMDGPU::S_CBRANCH_EXECZ: - Changed = removeExeczBranch(MI, MBB); - break; - default: - break; - } - } - - return Changed; -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -166,12 +166,13 @@ ; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB2_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB2_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -192,12 +193,13 @@ ; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT:
s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB2_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -218,12 +220,13 @@ ; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB2_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -244,12 +247,13 @@ ; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB2_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -284,13 +288,14 @@ ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; 
SI-NEXT: s_cbranch_scc0 BB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) @@ -312,13 +317,14 @@ ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -340,13 +346,14 @@ ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) @@ -368,13 +375,14 @@ ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: 
s_cbranch_scc0 BB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) @@ -416,13 +424,14 @@ ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] @@ -444,13 +453,14 @@ ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] @@ -472,13 +482,14 @@ ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: 
s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 @@ -500,13 +511,14 @@ ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] @@ -660,13 +672,14 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: BB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] @@ -682,12 +695,13 @@ ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_cbranch_execz BB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: BB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 
exec, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -706,13 +720,14 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: BB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -728,12 +743,13 @@ ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: BB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -752,13 +768,14 @@ ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: BB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 @@ -772,12 +789,13 @@ ; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 ; GFX10-32-NEXT: 
s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: BB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -796,13 +814,14 @@ ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: BB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -816,12 +835,13 @@ ; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: BB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 @@ -883,13 +903,14 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; 
%bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[8:9] -; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: BB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_mov_b32_e32 v0, s6 @@ -948,13 +969,14 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: BB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1013,13 +1035,14 @@ ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch BB7_5 @@ -1075,13 +1098,14 @@ ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 
; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir --- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0 --- diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir --- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass 
si-remove-short-exec-branches -amdgpu-skip-threshold=3 %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -167,12 +167,13 @@ ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB2_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 BB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB2_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -194,12 +195,13 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cbranch_scc0 BB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB2_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -221,12 +223,13 @@ ; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 ; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: 
BB2_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -248,12 +251,13 @@ ; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB2_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -289,13 +293,14 @@ ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -317,13 +322,14 @@ ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] 
dmask:0x1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -345,13 +351,14 @@ ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) @@ -373,13 +380,14 @@ ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) @@ -421,13 +429,14 @@ ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, 
v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] @@ -449,13 +458,14 @@ ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] @@ -477,13 +487,14 @@ ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 @@ -505,13 +516,14 @@ ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 
exec, exec, s[12:13] @@ -659,13 +671,14 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: BB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -681,12 +694,13 @@ ; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: BB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 @@ -705,13 +719,14 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: BB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -727,12 +742,13 @@ ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: 
s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: BB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -751,13 +767,14 @@ ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: BB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 @@ -771,12 +788,13 @@ ; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: BB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -795,13 +813,14 @@ ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: 
s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: BB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -815,12 +834,13 @@ ; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: BB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 @@ -875,13 +895,14 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: BB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch BB7_5 @@ -940,13 +961,14 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: BB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch BB7_5 @@ -1005,13 +1027,14 @@ ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: s_branch BB7_5 @@ -1067,13 +1090,14 @@ ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-64-NEXT: s_branch BB7_5 diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 
-run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. # FIXME: -amdgpu-skip-threshold seems to be backwards. diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. # FIXME: -amdgpu-skip-threshold seems to be backwards. 
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir --- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir +++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s --- name: skip_branch_taildup_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1002,13 +1002,14 @@ ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; SI-NEXT: s_cbranch_scc0 BB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc -; SI-NEXT: ; %bb.3: ; %bb4 +; SI-NEXT: BB13_3: ; %bb4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_mov_b32 s2, s0 @@ -1043,13 +1044,14 @@ ; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_cbranch_execz BB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; GFX10-WAVE64-NEXT: ; %bb.3: ; %bb4 +; GFX10-WAVE64-NEXT: BB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 @@ -1082,13 +1084,14 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_cbranch_execz BB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo -; GFX10-WAVE32-NEXT: ; %bb.3: ; %bb4 +; GFX10-WAVE32-NEXT: BB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 @@ -1154,12 +1157,13 @@ ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB14_3 ; SI-NEXT: ; %bb.1: ; %kill ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB14_6 ; SI-NEXT: ; %bb.2: ; %kill ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %Flow +; SI-NEXT: BB14_3: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1190,12 +1194,13 @@ ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-WAVE64-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 -; GFX10-WAVE64-NEXT: ; %bb.3: ; %Flow +; GFX10-WAVE64-NEXT: BB14_3: ; %Flow ; GFX10-WAVE64-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] ; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr2 ; GFX10-WAVE64-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1226,12 +1231,13 @@ ; GFX10-WAVE32-NEXT: 
v_cmp_ge_f32_e32 vcc_lo, 0, v1 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-WAVE32-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-WAVE32-NEXT: ; %bb.3: ; %Flow +; GFX10-WAVE32-NEXT: BB14_3: ; %Flow ; GFX10-WAVE32-NEXT: s_or_saveexec_b32 s0, s1 ; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr2 ; GFX10-WAVE32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -108,25 +108,26 @@ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GCN: bb.4.Flow1: - ; GCN: successors: %bb.5(0x40000000) + ; GCN: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec ; GCN: bb.5.kill0: - ; GCN: successors: %bb.8(0x40000000), %bb.7(0x40000000) + ; GCN: successors: %bb.6(0x40000000), %bb.8(0x40000000) ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc - ; GCN: S_CBRANCH_SCC0 %bb.7, implicit $scc - ; GCN: bb.8.kill0: - ; GCN: successors: %bb.6(0x80000000) + ; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc + ; GCN: bb.6.kill0: + ; GCN: successors: %bb.7(0x80000000) ; GCN: liveins: $sgpr2_sgpr3, $scc ; 
GCN: $exec = S_MOV_B64 0 - ; GCN: bb.6.end: + ; GCN: bb.7.end: ; GCN: successors: %bb.9(0x80000000) ; GCN: liveins: $sgpr2_sgpr3 ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; GCN: S_BRANCH %bb.9 - ; GCN: bb.7: + ; GCN: bb.8: ; GCN: $exec = S_MOV_B64 0 ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec ; GCN: S_ENDPGM 0 diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -222,7 +222,6 @@ "SIPreEmitPeephole.cpp", "SIProgramInfo.cpp", "SIRegisterInfo.cpp", - "SIRemoveShortExecBranches.cpp", "SIShrinkInstructions.cpp", "SIWholeQuadMode.cpp", ]