Index: lib/CodeGen/BranchRelaxation.cpp =================================================================== --- lib/CodeGen/BranchRelaxation.cpp +++ lib/CodeGen/BranchRelaxation.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Support/Debug.h" @@ -23,6 +24,7 @@ STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumConditionalRelaxed, "Number of conditional branches relaxed"); +STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed"); #define BRANCH_RELAX_NAME "Branch relaxation pass" @@ -57,17 +59,22 @@ }; SmallVector BlockInfo; + std::unique_ptr RS; MachineFunction *MF; const TargetInstrInfo *TII; bool relaxBranchInstructions(); void scanFunction(); + + MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &BB); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI); void adjustBlockOffsets(MachineBasicBlock &MBB); bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const; bool fixupConditionalBranch(MachineInstr &MI); + bool fixupUnconditionalBranch(MachineInstr &MI); uint64_t computeBlockSize(const MachineBasicBlock &MBB) const; unsigned getInstrOffset(const MachineInstr &MI) const; void dumpBBs(); @@ -189,6 +196,19 @@ } } + /// Create a new empty basic block and insert it after \p BB. +MachineBasicBlock *BranchRelaxation::createNewBlockAfter(MachineBasicBlock &BB) { + // Create a new, empty MBB and place it immediately after BB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(BB.getBasicBlock()); + MF->insert(++BB.getIterator(), NewBB); + + // Insert an entry into BlockInfo to align it properly with the block numbers. 
+ BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + return NewBB; +} + /// Split the basic block containing MI into two blocks, which are joined by /// an unconditional branch. Update data structures and renumber blocks to /// account for this change and returns the newly created block. @@ -351,22 +371,89 @@ return true; } +bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + + unsigned OldBrSize = TII->getInstSizeInBytes(MI); + MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI); + + int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset; + int64_t SrcOffset = getInstrOffset(MI); + + assert(!TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - SrcOffset)); + + BlockInfo[MBB->getNumber()].Size -= OldBrSize; + + MachineBasicBlock *BranchBB = MBB; + + // If this was an expanded conditional branch, there is already a single + // unconditional branch in a block. + if (!MBB->empty()) { + BranchBB = createNewBlockAfter(*MBB); + + // Add live outs. + for (const MachineBasicBlock *Succ : MBB->successors()) { + for (const MachineBasicBlock::RegisterMaskPair &LiveIn : Succ->liveins()) + BranchBB->addLiveIn(LiveIn); + } + + BranchBB->addSuccessor(DestBB); + MBB->replaceSuccessor(DestBB, BranchBB); + } + + DebugLoc DL = MI.getDebugLoc(); + MI.eraseFromParent(); + + // insertUnconditionalBranch may have inserted a new block. NOTE(review): the size below is credited to MBB's BlockInfo entry although the code is emitted into BranchBB, and the computeBlockSize() return value is discarded - confirm this is intended. + BlockInfo[MBB->getNumber()].Size += TII->insertUnconditionalBranch( + *BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get()); + + computeBlockSize(*BranchBB); + adjustBlockOffsets(*MBB); + return true; +} + bool BranchRelaxation::relaxBranchInstructions() { bool Changed = false; + // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. 
for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { MachineBasicBlock &MBB = *I; - MachineBasicBlock::iterator J = MBB.getFirstTerminator(); - if (J == MBB.end()) - continue; - MachineInstr &MI = *J; - if (MI.isConditionalBranch() && - !isBlockInRange(MI, *TII->getBranchDestBlock(MI))) { - fixupConditionalBranch(MI); - ++NumConditionalRelaxed; - Changed = true; + MachineBasicBlock::iterator Next; + for (MachineBasicBlock::iterator J = MBB.getFirstTerminator(); + J != MBB.end(); J = Next) { + Next = std::next(J); + MachineInstr &MI = *J; + + if (MI.isUnconditionalBranch()) { + // Unconditional branch destination might be unanalyzable, assume these + // are OK. + if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI)) { + if (!isBlockInRange(MI, *DestBB)) { + fixupUnconditionalBranch(MI); + ++NumUnconditionalRelaxed; + Changed = true; + } + } + + // Unconditional branch is the last terminator. + break; + } + + if (MI.isConditionalBranch()) { + if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI)) { + if (!isBlockInRange(MI, *DestBB)) { + fixupConditionalBranch(MI); + ++NumConditionalRelaxed; + Changed = true; + + // This may have modified all of the terminators, so start over. + Next = MBB.getFirstTerminator(); + } + } + } } } @@ -378,7 +465,12 @@ DEBUG(dbgs() << "***** BranchRelaxation *****\n"); - TII = MF->getSubtarget().getInstrInfo(); + const TargetSubtargetInfo &ST = MF->getSubtarget(); + TII = ST.getInstrInfo(); + + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + if (TRI->trackLivenessAfterRegAlloc(*MF)) + RS.reset(new RegScavenger()); // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. 
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -121,6 +121,9 @@ void EmitStartOfAsmFile(Module &M) override; + bool isBlockOnlyReachableByFallthrough( + const MachineBasicBlock *MBB) const override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -120,6 +120,21 @@ emitStartOfRuntimeMetadata(M); } +bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( + const MachineBasicBlock *MBB) const { + if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) + return false; + + if (MBB->empty()) + return true; + + // If this is a block implementing a long branch, an expression relative to + // the start of the block is needed. + // XXX - Is there a smarter way to check this? 
+ return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); +} + + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget(); SIProgramInfo KernelInfo; Index: lib/Target/AMDGPU/AMDGPUMCInstLower.h =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -13,14 +13,20 @@ namespace llvm { class AMDGPUSubtarget; +class MachineBasicBlock; class MachineInstr; +class MachineOperand; class MCContext; +class MCExpr; class MCInst; class AMDGPUMCInstLower { MCContext &Ctx; const AMDGPUSubtarget &ST; + const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const; + public: AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -46,6 +46,27 @@ } } +const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( + const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const { + const MCExpr *DestBBSym + = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); + const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); + + assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 && + ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); + + // s_getpc_b64 returns the address of next instruction. 
+ const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); + SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); + + if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD) + return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); + + assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD); + return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); @@ -70,8 +91,14 @@ MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), Ctx)); + if (MO.getTargetFlags() != 0) { + MCOp = MCOperand::createExpr( + getLongBranchBlockExpr(*MI->getParent(), MO)); + } else { + MCOp = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); + } + break; case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); @@ -90,6 +117,10 @@ MCOp = MCOperand::createExpr(Expr); break; } + case MachineOperand::MO_MCSymbol: + MCOp = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMCSymbol(), Ctx)); + break; } OutMI.addOperand(MCOp); } Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -585,6 +585,7 @@ addPass(createSIShrinkInstructionsPass()); addPass(createSILowerControlFlowPass()); addPass(createSIDebuggerInsertNopsPass()); + addPass(&BranchRelaxationPassID); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ 
lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -38,12 +38,15 @@ const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // SCRATCH_RSRC_DWORD[01] is a special global variable that represents - // the scratch buffer. - if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") - return ELF::R_AMDGPU_ABS32_LO; - if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") - return ELF::R_AMDGPU_ABS32_HI; + if (const auto *SymA = Target.getSymA()) { + // SCRATCH_RSRC_DWORD[01] is a special global variable that represents + // the scratch buffer. + if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") + return ELF::R_AMDGPU_ABS32_LO; + + if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") + return ELF::R_AMDGPU_ABS32_HI; + } switch (Target.getAccessVariant()) { default: Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -149,6 +149,27 @@ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + bool isBranchOffsetInRange(unsigned BranchOpc, + int64_t BrOffset) const override; + + MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; + + void setBranchDestBlock(MachineInstr &BranchInst, + MachineBasicBlock &NewDestBB) const override; + + unsigned insertInvertedConditionalBranch( + MachineBasicBlock &SrcBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, + const MachineInstr &OldBr , + MachineBasicBlock &NewDestBB) const override; + + unsigned insertUnconditionalBranch(MachineBasicBlock &MBB, + MachineBasicBlock &NewDestBB, + const DebugLoc &DL, + int64_t BrOffset = 0, + RegScavenger *RS = nullptr) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, @@ -580,6 +601,12 @@ const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const 
uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23); + + // For MachineOperands. + enum TargetFlags { + TF_LONG_BRANCH_FORWARD = 1 << 0, + TF_LONG_BRANCH_BACKWARD = 1 << 1 + }; } // End namespace AMDGPU namespace SI { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -28,6 +28,13 @@ using namespace llvm; +// Must be at least 4 to be able to branch over minimum unconditional branch +// code. This is only for making it possible to write reasonably small tests for +// long branches. +static cl::opt +BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), + cl::desc("Restrict range of branch instructions (DEBUG)")); + SIInstrInfo::SIInstrInfo(const SISubtarget &ST) : AMDGPUInstrInfo(ST), RI(), ST(ST) {} @@ -1063,6 +1070,167 @@ return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } +bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, + int64_t BrOffset) const { + // BranchRelaxation should never have to check s_setpc_b64 because its dest + // block is unanalyzable. + assert(BranchOp != AMDGPU::S_SETPC_B64); + + // Convert to dwords. + BrOffset /= 4; + + // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is + // from the next instruction. + BrOffset -= 1; + + return isIntN(BranchOffsetBits, BrOffset); +} + +MachineBasicBlock *SIInstrInfo::getBranchDestBlock( + const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { + // This would be a difficult analysis to perform, but can always be legal so + // there's no need to analyze it. 
+ return nullptr; + } + + return MI.getOperand(0).getMBB(); +} + +void SIInstrInfo::setBranchDestBlock(MachineInstr &MI, + MachineBasicBlock &NewDestBB) const { + return MI.getOperand(0).setMBB(&NewDestBB); +} + +static unsigned getInverseBrOpcode(unsigned Opc) { + switch (Opc) { + case AMDGPU::S_CBRANCH_SCC0: + return AMDGPU::S_CBRANCH_SCC1; + case AMDGPU::S_CBRANCH_SCC1: + return AMDGPU::S_CBRANCH_SCC0; + case AMDGPU::S_CBRANCH_VCCZ: + return AMDGPU::S_CBRANCH_VCCNZ; + case AMDGPU::S_CBRANCH_VCCNZ: + return AMDGPU::S_CBRANCH_VCCZ; + case AMDGPU::S_CBRANCH_EXECNZ: + return AMDGPU::S_CBRANCH_EXECZ; + case AMDGPU::S_CBRANCH_EXECZ: + return AMDGPU::S_CBRANCH_EXECNZ; + default: + llvm_unreachable("invalid conditional branch opcode"); + } +} + +unsigned SIInstrInfo::insertInvertedConditionalBranch( + MachineBasicBlock &SrcBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, const MachineInstr &OldBr, + MachineBasicBlock &DestBB) const { + BuildMI(SrcBB, InsPt, DL, get(getInverseBrOpcode(OldBr.getOpcode()))) + .addMBB(&DestBB); + return 4; +} + +unsigned SIInstrInfo::insertUnconditionalBranch(MachineBasicBlock &MBB, + MachineBasicBlock &DestBB, + const DebugLoc &DL, + int64_t BrOffset, + RegScavenger *RS) const { + if (isIntN(BranchOffsetBits, (BrOffset / 4) - 1)) { + BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) + .addMBB(&DestBB); + return 4; + } + + assert(RS && "RegScavenger required for long branching"); + assert(MBB.empty() && + "new block should be inserted for expanding unconditional branch"); + assert(MBB.pred_size() == 1); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // FIXME: Virtual register workaround for RegScavenger not working with empty + // blocks. + unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + auto I = MBB.end(); + + // We need to compute the offset relative to the instruction immediately after + // s_getpc_b64. Insert pc arithmetic code before last terminator. 
+ MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); + + // TODO: Handle > 32-bit block address. + if (BrOffset >= 0) { + BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } else { + // Backwards branch. + BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } + + // Insert the indirect branch after the other terminator. + BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) + .addReg(PCReg); + + // FIXME: If spilling is necessary, this will fail because this scavenger has + // no emergency stack slots. It is non-trivial to spill in this situation, + // because the restore code needs to be specially placed after the + // jump. BranchRelaxation then needs to be made aware of the newly inserted + // block. + // + // If a spill is needed for the pc register pair, we need to insert a spill + // restore block right before the destination block, and insert a short branch + // into the old destination block's fallthrough predecessor. + // e.g.: + // + // s_cbranch_scc0 skip_long_branch: + // + // long_branch_bb: + // spill s[8:9] + // s_getpc_b64 + // s_add_u32 s8, s8, restore_bb + // s_addc_u32 s9, s9, 0 + // s_setpc_b64 s[8:9] + // + // skip_long_branch: + // foo; + // + // ..... 
+ // + // dest_bb_fallthrough_predecessor: + // bar; + // s_branch dest_bb + // + // restore_bb: + // restore s[8:9] + // fallthrough dest_bb + /// + // dest_bb: + // buzz; + + RS->enterBasicBlockEnd(MBB); + unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, + MachineBasicBlock::iterator(GetPC), 0); + MRI.replaceRegWith(PCReg, Scav); + MRI.clearVirtRegs(); + RS->setRegUsed(Scav); + + return 4 + 8 + 4 + 4; +} + unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { switch (Cond) { case SIInstrInfo::SCC_TRUE: Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -764,16 +764,18 @@ // no input, 64-bit output. multiclass SOP1_64_0 pattern> { - def "" : SOP1_Pseudo ; + let Size = 4 in { + def "" : SOP1_Pseudo ; - def _si : SOP1_Real_si { - let src0 = 0; - } + def _si : SOP1_Real_si { + let src0 = 0; + } - def _vi : SOP1_Real_vi { - let src0 = 0; + def _vi : SOP1_Real_vi { + let src0 = 0; + } } } Index: test/CodeGen/AMDGPU/branch-relax-spill.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -0,0 +1,238 @@ +; RUN: not llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s 2>&1 | FileCheck -check-prefix=FAIL %s + +; FIXME: This should be able to compile, but requires inserting an +; extra block to restore the scavenged register. + +; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot! 
+ +define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { +entry: + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={SGPR2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={SGPR3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={SGPR4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={SGPR5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={SGPR6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={SGPR7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={SGPR8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={SGPR9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={SGPR10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={SGPR11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={SGPR12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={SGPR13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={SGPR14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={SGPR15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={SGPR16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={SGPR17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={SGPR18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={SGPR19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={SGPR20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={SGPR21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={SGPR22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={SGPR23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={SGPR24}"() #0 + %sgpr25 
= tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={SGPR25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={SGPR26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={SGPR27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={SGPR28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={SGPR29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={SGPR30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={SGPR31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={SGPR32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={SGPR33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={SGPR34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={SGPR35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={SGPR36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={SGPR37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={SGPR38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={SGPR39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={SGPR40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={SGPR41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={SGPR42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={SGPR43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={SGPR44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={SGPR45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={SGPR46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={SGPR47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={SGPR48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={SGPR49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 
0", "={SGPR50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={SGPR51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={SGPR52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={SGPR53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={SGPR54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={SGPR55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={SGPR56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={SGPR57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={SGPR58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={SGPR59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={SGPR60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={SGPR61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={SGPR62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={SGPR63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={SGPR64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={SGPR65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={SGPR66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={SGPR67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={SGPR68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={SGPR69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={SGPR70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={SGPR71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={SGPR72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={SGPR73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={SGPR74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={SGPR75}"() #0 + %sgpr76 = tail call i32 
asm sideeffect "s_mov_b32 s76, 0", "={SGPR76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={SGPR77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={SGPR78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={SGPR79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={SGPR80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={SGPR81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={SGPR82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={SGPR83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={SGPR84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={SGPR85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={SGPR86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={SGPR87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={SGPR88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={SGPR89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={SGPR90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={SGPR91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={SGPR92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={SGPR93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={SGPR94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={SGPR95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={SGPR96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={SGPR97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={SGPR98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={SGPR99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={SGPR100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", 
"={SGPR101}"() #0 + %sgpr102 = tail call i32 asm sideeffect "s_mov_b32 s102, 0", "={SGPR102}"() #0 + %sgpr103 = tail call i32 asm sideeffect "s_mov_b32 s103, 0", "={SGPR103}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_LO}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_HI}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 28 bytes + ; 24 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{SGPR0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR18}"(i32 %sgpr18) #0 + tail call void asm sideeffect 
"; reg use $0", "{SGPR19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR45}"(i32 %sgpr45) #0 + tail 
call void asm sideeffect "; reg use $0", "{SGPR46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", 
"{SGPR72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR98}"(i32 %sgpr98) #0 + tail call void asm 
sideeffect "; reg use $0", "{SGPR99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR102}"(i32 %sgpr102) #0 + tail call void asm sideeffect "; reg use $0", "{SGPR103}"(i32 %sgpr103) #0 + tail call void asm sideeffect "; reg use $0", "{VCC_LO}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{VCC_HI}"(i32 %vcc_hi) #0 + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -0,0 +1,479 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s +; Restrict maximum branch to between +7 and -8 dwords + +; Used to emit an always 4 byte instruction. Inline asm always assumes +; each instruction is the maximum size. 
+declare void @llvm.amdgcn.s.sleep(i32) #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch: +; GCN: s_load_dword [[CND:s[0-9]+]] +; GCN: s_cmp_eq_i32 [[CND]], 0 +; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]] + + +; GCN-NEXT: ; BB#1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 + +; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] +; GCN: buffer_store_dword [[V_CND]] +; GCN: s_endpgm +define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +bb: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: +; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: + store volatile i32 %cnd, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch: +; GCN: s_load_dword [[CND:s[0-9]+]] +; GCN: s_cmp_eq_i32 [[CND]], 0 +; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0 +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[LONGBB]]: +; GCN-NEXT: ;;#ASMSTART +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN-NEXT: ;;#ASMEND + +; GCN-NEXT: [[ENDBB]]: +; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] +; GCN: buffer_store_dword [[V_CND]] +; GCN: s_endpgm +define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +bb0: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch + +bb2: +; 32 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br
label %bb3 + +bb3: + store volatile i32 %cnd, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch: +; GCN: s_load_dword [[CND:s[0-9]+]] +; GCN-DAG: v_cmp_eq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, [[CND]] +; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] +; GCN: s_and_b64 vcc, exec, [[CMP]] +; GCN-NEXT: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0 +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[LONGBB]]: +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 + +; GCN: [[ENDBB]]: +; GCN: buffer_store_dword [[V_CND]] +; GCN: s_endpgm +define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { +bb0: + %cmp = fcmp oeq float %cnd, 0.0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile float %cnd, float addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}min_long_forward_vbranch: + +; GCN: buffer_load_dword +; GCN: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]] + +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 + +; GCN: s_or_b64 exec, exec, [[SAVE]] +; GCN: buffer_store_dword +; GCN: s_endpgm +define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext + %load = load volatile i32, i32 addrspace(1)* %gep + %cmp = icmp eq i32 %load, 0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ;
32 bytes + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + store volatile i32 %load, i32 addrspace(1)* %gep + ret void +} + +; FIXME: Should be able to use s_cbranch_scc0 +; GCN-LABEL: {{^}}long_backward_sbranch: +; GCN: v_mov_b32_e32 [[LOOPIDX:v[0-9]+]], 0{{$}} + +; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]] + +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND + +; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2 +; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]] +; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0 +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[ENDBB]]: +; GCN-NEXT: s_endpgm +define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { +bb: + br label %bb2 + +bb2: + %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ] + ; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + %inc = add nsw i32 %loop.idx, 1 ; add cost 4 + %cmp = icmp slt i32 %inc, 10 ; condition cost = 8 + br i1 %cmp, label %bb2, label %bb3 ; - + +bb3: + ret void +} + +; Requires expansion of unconditional branch from %bb2 to %bb4 (and +; expansion of conditional branch from %bb0 to %bb3).
+ +; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch: +; GCN: s_cmp_eq_i32 +; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONG_JUMP0]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[BB2]]: ; %bb2 +; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17 +; GCN: buffer_store_dword [[BB2_K]] +; GCN: s_waitcnt vmcnt(0) + +; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]+_[0-9]+]]-([[LONG_JUMP1]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc + +; GCN: [[BB3]]: ; %bb3 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: ;;#ASMEND + +; GCN-NEXT: [[BB4]]: ; %bb4 +; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63 +; GCN: buffer_store_dword [[BB4_K]] +; GCN-NEXT: s_endpgm +; GCN-NEXT: .Lfunc_end{{[0-9]+}}: +define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +bb0: + %tmp = icmp ne i32 %arg1, 0 + br i1 %tmp, label %bb2, label %bb3 + +bb2: + store volatile i32 17, i32 addrspace(1)* undef + br label %bb4 + +bb3: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb4 + +bb4: + store volatile i32 63, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch: +; GCN-NEXT: ; BB#0: ; %entry + +; GCN-NEXT: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND + +; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop +; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo,
([[LONGBB]]+4)-[[LOOP]] +; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc +; GCN-NEXT: .Lfunc_end{{[0-9]+}}: +define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +entry: + br label %loop + +loop: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %loop +} + +; Expansion of branch from %bb1 to %bb3 introduces need to expand +; branch from %bb0 to %bb2 + +; GCN-LABEL: {{^}}expand_requires_expand: +; GCN-NEXT: ; BB#0: ; %bb0 +; GCN: s_load_dword +; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 0{{$}} +; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]+_[0-9]+]]-([[LONGBB0]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[BB1]]: ; %bb1 +; GCN-NEXT: s_load_dword +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_i32 s{{[0-9]+}}, 3{{$}} +; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[BB2]]: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND + +; GCN-NEXT: [[BB3]]: ; %bb3 +; GCN-NEXT: s_endpgm +define void @expand_requires_expand(i32 %cond0) #0 { +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %cmp0 = icmp slt i32 %cond0, 0 + br i1 %cmp0, label %bb2, label %bb1 + +bb1: + %val = load volatile i32, i32 addrspace(2)* undef + %cmp1 = icmp eq i32 %val, 3 + br i1 %cmp1, label %bb3, label %bb2 + +bb2: + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %bb3 + +bb3: + ret void +}
+ +; Requires expansion of the required skip branch. + +; GCN-LABEL: {{^}}uniform_inside_divergent: +; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] +; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]+_[0-9]+]]-([[LONGBB]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}} +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[IF]]: ; %if +; GCN: buffer_store_dword +; GCN: s_cmp_lg_i32 +; GCN: s_cbranch_scc1 [[ENDIF]] + +; GCN-NEXT: ; BB#2: ; %if_uniform +; GCN: buffer_store_dword +; GCN: s_waitcnt vmcnt(0) + +; GCN-NEXT: [[ENDIF]]: ; %endif +; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN-NEXT: s_endpgm +define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + %u_cmp = icmp eq i32 %cond, 0 + br i1 %u_cmp, label %if_uniform, label %endif + +if_uniform: + store i32 1, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; si_mask_branch +; s_cbranch_execz +; s_branch + +; GCN-LABEL: {{^}}analyze_mask_branch: +; GCN: v_cmp_lt_f32_e32 vcc +; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]] +; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: [[BRANCH_SKIP]]: ; %entry +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[RET]]-([[BRANCH_SKIP]]+4) +; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0 +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body +;
GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: v_nop_e64 +; GCN: ;;#ASMEND +; GCN-NEXT: s_and_b64 vcc, exec, -1{{$}} +; GCN-NEXT: s_cbranch_vccz [[RET]] + +; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body +; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1 +; GCN-NEXT: s_getpc_b64 vcc +; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]] +; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0 +; GCN-NEXT: s_setpc_b64 vcc + +; GCN-NEXT: [[RET]]: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] +; GCN: buffer_store_dword +; GCN-NEXT: s_endpgm +define void @analyze_mask_branch() #0 { +entry: + %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() + %cmp0 = fcmp ogt float %reg, 0.000000e+00 + br i1 %cmp0, label %loop, label %ret + +loop: + %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ] + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + %cmp1 = fcmp olt float %phi, 8.0 + br i1 %cmp1, label %loop_body, label %ret + +loop_body: + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", ""() #0 + br label %loop + +ret: + store volatile i32 7, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll =================================================================== --- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -4,24 +4,27 @@ ; GCN: v_cmp_eq_i32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: s_branch BB0_1 +; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] +; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: [[RET]] ; GCN: s_or_b64 exec, exec ; GCN: s_endpgm +; GCN: [[UNREACHABLE]]: ; GCN: ds_write_b32 ; GCN: s_waitcnt define void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp
eq i32 %tmp15, 32 - br i1 %tmp63, label %bb64, label %bb68 + br i1 %tmp63, label %unreachable, label %ret -bb64: +unreachable: store volatile i32 0, i32 addrspace(3)* undef, align 4 unreachable -bb68: +ret: ret void } @@ -29,21 +32,25 @@ ; GCN: v_cmp_eq_i32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: s_endpgm +; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] -; GCN: s_or_b64 exec, exec +; GCN-NEXT: ; %ret +; GCN-NEXT: s_endpgm + +; GCN-NEXT: [[UNREACHABLE]]: +; GCN-NEXT: s_or_b64 exec, exec ; GCN: ds_write_b32 ; GCN: s_waitcnt define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 - br i1 %tmp63, label %bb68, label %bb64 + br i1 %tmp63, label %ret, label %unreachable -bb68: +ret: ret void -bb64: +unreachable: store volatile i32 0, i32 addrspace(3)* undef, align 4 unreachable } Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -105,7 +105,7 @@ ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] -; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: ; BB#2: ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; CHECK-NEXT: s_endpgm @@ -156,7 +156,7 @@ ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] -; CHECK-NEXT: ; BB#4: +; CHECK-NEXT: ; BB#2: ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; CHECK-NEXT: s_endpgm @@ -270,7 +270,7 @@ ; CHECK: s_and_b64 vcc, exec, vcc ; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]] -; CHECK: ; BB#3: ; %bb10 +; CHECK: ; %bb10 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9 ; CHECK: buffer_store_dword @@ -306,7 +306,7 @@ ; CHECK: s_and_b64 vcc, exec, ; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]] -; CHECK: ; BB#3: ; %bb6 +; CHECK: ; %bb6 ; CHECK: s_mov_b64 exec, 0 ; CHECK: [[SKIPKILL]]: