Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1214,6 +1214,12 @@
     MI.setDesc(get(AMDGPU::S_XOR_B64));
     break;
 
+  case AMDGPU::S_OR_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_OR_B64));
+    break;
+
   case AMDGPU::S_ANDN2_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -1698,6 +1704,7 @@
   case AMDGPU::SI_MASK_BRANCH:
   case AMDGPU::S_MOV_B64_term:
   case AMDGPU::S_XOR_B64_term:
+  case AMDGPU::S_OR_B64_term:
   case AMDGPU::S_ANDN2_B64_term:
     break;
 
   case AMDGPU::SI_IF:
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -189,6 +189,7 @@
 }
 
 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
+def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -55,6 +55,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -79,8 +80,11 @@
 private:
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
-  LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  LiveIntervals *LIS = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachineLoopInfo *MLI = nullptr;
+
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
@@ -111,7 +115,7 @@
     AU.addPreservedID(LiveVariablesID);
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
 
@@ -388,23 +392,99 @@
   MI.eraseFromParent();
 }
 
+// Insert \p NewMI (which modifies exec) at the position of \p MI in \p MBB,
+// such that \p MBB is split as necessary to keep the exec modification in its
+// own block.
+static MachineBasicBlock *insertInstWithExecFallthrough(MachineBasicBlock &MBB,
+                                                        MachineInstr &MI,
+                                                        MachineInstr *NewMI,
+                                                        MachineDominatorTree *DT,
+                                                        LiveIntervals *LIS,
+                                                        MachineLoopInfo *MLI) {
+  assert(NewMI->isTerminator());
+
+  MachineBasicBlock::iterator InsPt = MI.getIterator();
+  if (std::next(MI.getIterator()) == MBB.end()) {
+    // Don't bother with a new block.
+    MBB.insert(InsPt, NewMI);
+    if (LIS)
+      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *SplitMBB
+    = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MachineFunction::iterator(MBB), SplitMBB);
+
+  // FIXME: This is working around a MachineDominatorTree API defect.
+  //
+  // If a previous pass split a critical edge, it may not have been applied to
+  // the DomTree yet. applySplitCriticalEdges is lazily applied, and inspects
+  // the CFG of the given block. Make sure to call a dominator tree method that
+  // will flush this cache before touching the successors of the block.
+  MachineDomTreeNode *NodeMBB = nullptr;
+  if (DT)
+    NodeMBB = DT->getNode(&MBB);
+
+  // Move everything to the new block, except the end_cf pseudo.
+  SplitMBB->splice(SplitMBB->begin(), &MBB, MBB.begin(), MBB.end());
+
+  SplitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(SplitMBB, BranchProbability::getOne());
+
+  MBB.insert(MBB.end(), NewMI);
+
+  if (DT) {
+    std::vector<MachineDomTreeNode *> Children = NodeMBB->getChildren();
+    DT->addNewBlock(SplitMBB, &MBB);
+
+    // Reparent all of the children to the new block body.
+    auto *SplitNode = DT->getNode(SplitMBB);
+    for (auto *Child : Children)
+      DT->changeImmediateDominator(Child, SplitNode);
+  }
+
+  if (MLI) {
+    if (MachineLoop *Loop = MLI->getLoopFor(&MBB))
+      Loop->addBasicBlockToLoop(SplitMBB, MLI->getBase());
+  }
+
+  if (LIS) {
+    LIS->insertMBBInMaps(SplitMBB);
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+  }
+
+  // All live-ins are forwarded.
+  for (auto &LiveIn : MBB.liveins())
+    SplitMBB->addLiveIn(LiveIn);
+
+  MI.eraseFromParent();
+  return SplitMBB;
+}
+
 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
   MachineBasicBlock::iterator InsPt = MBB.begin();
-  MachineInstr *NewMI =
-      BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .add(MI.getOperand(0));
+
+  // First, move the instruction. It's unnecessarily difficult to update
+  // LiveIntervals when there's a change in control flow, so move the
+  // instruction before changing the blocks.
+  MBB.splice(InsPt, &MBB, MI.getIterator());
   if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    LIS->handleMove(MI);
 
-  MI.eraseFromParent();
+  MachineFunction *MF = MBB.getParent();
 
-  if (LIS)
-    LIS->handleMove(*NewMI);
+  // Create instruction without inserting it yet.
+  MachineInstr *NewMI
+    = BuildMI(*MF, DL, TII->get(AMDGPU::S_OR_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .add(MI.getOperand(0));
+  insertInstWithExecFallthrough(MBB, MI, NewMI, DT, LIS, MLI);
 }
 
 // Returns replace operands for a logical operation, either single result
@@ -470,17 +550,20 @@
   // This doesn't actually need LiveIntervals, but we can preserve them.
   LIS = getAnalysisIfAvailable<LiveIntervals>();
+  DT = getAnalysisIfAvailable<MachineDominatorTree>();
+  MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
   MRI = &MF.getRegInfo();
 
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
-    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock *MBB = &*BI;
 
     MachineBasicBlock::iterator I, Next, Last;
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    for (I = MBB->begin(), Last = MBB->end(); I != MBB->end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
@@ -501,10 +584,24 @@
         emitLoop(MI);
         break;
 
-      case AMDGPU::SI_END_CF:
+      case AMDGPU::SI_END_CF: {
+        MachineInstr *NextMI = nullptr;
+
+        if (Next != MBB->end())
+          NextMI = &*Next;
+
         emitEndCf(MI);
-        break;
 
+        if (NextMI) {
+          MBB = NextMI->getParent();
+          Next = NextMI->getIterator();
+          Last = MBB->end();
+        }
+
+        NextBB = std::next(MBB->getIterator());
+        BE = MF.end();
+        break;
+      }
       case AMDGPU::S_AND_B64:
      case AMDGPU::S_OR_B64:
         // Cleanup bit manipulations on exec mask
@@ -518,7 +615,7 @@
     }
 
     // Replay newly inserted code to combine masks
-    Next = (Last == MBB.end()) ? MBB.begin() : Last;
+    Next = (Last == MBB->end()) ? MBB->begin() : Last;
   }
 }
Index: lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
===================================================================
--- lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -149,6 +149,12 @@
     MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
     return true;
   }
+  case AMDGPU::S_OR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+    return true;
+  }
   case AMDGPU::S_ANDN2_B64_term: {
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
Index: lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
===================================================================
--- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -83,7 +83,7 @@
 }
 
 static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
-  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
+  return MI.getOpcode() == AMDGPU::S_OR_B64_term &&
         MI.modifiesRegister(AMDGPU::EXEC, TRI);
 }
 
@@ -362,7 +362,7 @@
 
     // Try to collapse adjacent endifs.
     auto E = MBB.end();
-    auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
+    auto Lead = MBB.getFirstTerminator();
     if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
       continue;
Index: test/CodeGen/AMDGPU/collapse-endcf.mir
===================================================================
--- test/CodeGen/AMDGPU/collapse-endcf.mir
+++ test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -49,8 +49,10 @@
   ; GCN:   successors: %bb.4(0x80000000)
   ; GCN:   DBG_VALUE
   ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   DBG_VALUE
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -95,12 +97,14 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
     DBG_VALUE
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
     DBG_VALUE
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -121,7 +125,7 @@
 body: |
   ; GCN-LABEL: name: simple_nested_if_empty_block_between
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -129,7 +133,7 @@
   ; GCN:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
   ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_]]
-  ; GCN:   SI_MASK_BRANCH %bb.5, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
   ; GCN:   S_BRANCH %bb.1
   ; GCN: bb.1:
   ; GCN:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -158,7 +162,9 @@
   ; GCN: bb.4:
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN: bb.5:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -203,12 +209,14 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+
+  bb.4:
 
   bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.6:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -229,7 +237,7 @@
 body: |
   ; GCN-LABEL: name: simple_nested_if_empty_block_dbg_between
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -267,7 +275,9 @@
   ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   DBG_VALUE
   ; GCN: bb.5:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -283,7 +293,7 @@
     %3:sreg_64 = COPY $exec, implicit-def $exec
     %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc
     $exec = S_MOV_B64_term %4
-    SI_MASK_BRANCH %bb.4, implicit $exec
+    SI_MASK_BRANCH %bb.5, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -312,13 +322,15 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
-  bb.5:
+  bb.4:
     DBG_VALUE
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -360,8 +372,7 @@
   ; GCN:   %5.sub2:sgpr_128 = S_MOV_B32 0
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
-  ; GCN:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
-  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_1]]
   ; GCN:   SI_MASK_BRANCH %bb.3, implicit $exec
   ; GCN:   S_BRANCH %bb.2
@@ -376,9 +387,10 @@
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   dead %16:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -426,10 +438,12 @@
     %15:sgpr_32 = IMPLICIT_DEF
     %16:sgpr_32 = S_BREV_B32 %15
     KILL %15
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -475,7 +489,7 @@
   ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
   ; GCN:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_1]]
-  ; GCN:   SI_MASK_BRANCH %bb.3, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
   ; GCN:   S_BRANCH %bb.2
   ; GCN: bb.2:
   ; GCN:   successors: %bb.3(0x80000000)
@@ -485,12 +499,16 @@
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
+  ; GCN: bb.4:
+  ; GCN:   successors: %bb.5(0x80000000)
   ; GCN:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; GCN:   [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
   ; GCN:   KILL [[DEF]]
   ; GCN:   dead %17:sgpr_32 = COPY [[S_BREV_B32_]]
-  ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -525,7 +543,7 @@
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
     $exec = S_MOV_B64_term %13
-    SI_MASK_BRANCH %bb.3, implicit $exec
+    SI_MASK_BRANCH %bb.4, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -535,14 +553,18 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+
+  bb.4:
     %15:sgpr_32 = IMPLICIT_DEF
     %16:sgpr_32 = S_BREV_B32 %15
     KILL %15
     %19:sgpr_32 = COPY %16
 
-  bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -598,10 +620,14 @@
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   dead %15:sreg_64 = S_BREV_B64 $exec
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   dead %15:sreg_64 = S_BREV_B64 $exec
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
@@ -646,11 +672,15 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
-    %15:sreg_64 = S_BREV_B64 $exec
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    %15:sreg_64 = S_BREV_B64 $exec
+
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -671,7 +701,7 @@
 body: |
   ; GCN-LABEL: name: copy_no_explicit_exec_dependency
   ; GCN: bb.0:
-  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
   ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
   ; GCN:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -679,7 +709,7 @@
   ; GCN:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
   ; GCN:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
   ; GCN:   $exec = S_MOV_B64_term [[S_AND_B64_]]
-  ; GCN:   SI_MASK_BRANCH %bb.4, implicit $exec
+  ; GCN:   SI_MASK_BRANCH %bb.5, implicit $exec
   ; GCN:   S_BRANCH %bb.1
   ; GCN: bb.1:
   ; GCN:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -706,17 +736,21 @@
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
   ; GCN:   successors: %bb.4(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   dead %15:vgpr_32 = COPY %5.sub2
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   dead %15:vgpr_32 = COPY %5.sub2
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.6:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
   ; GCN:   DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
   ; GCN:   S_ENDPGM 0
   bb.0:
-    successors: %bb.1, %bb.4
+    successors: %bb.1, %bb.5
     liveins: $vgpr0, $sgpr0_sgpr1
 
     %1:sgpr_64 = COPY $sgpr0_sgpr1
@@ -725,7 +759,7 @@
     %3:sreg_64 = COPY $exec, implicit-def $exec
     %4:sreg_64 = S_AND_B64 %3, %2, implicit-def dead $scc
     $exec = S_MOV_B64_term %4
-    SI_MASK_BRANCH %bb.4, implicit $exec
+    SI_MASK_BRANCH %bb.5, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -754,11 +788,15 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
-    %15:vgpr_32 = COPY %5.sub2
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    %15:vgpr_32 = COPY %5.sub2
+
+  bb.5:
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.6:
     %17:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
@@ -813,17 +851,19 @@
   ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN:   BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
   ; GCN: bb.3:
-  ; GCN:   successors: %bb.5(0x80000000)
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
-  ; GCN:   S_BRANCH %bb.5
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY4]], implicit-def $scc
+  ; GCN:   S_BRANCH %bb.6
   ; GCN: bb.4:
-  ; GCN:   $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc
+  ; GCN: bb.5:
   ; GCN:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
   ; GCN:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GCN:   $m0 = S_MOV_B32 -1
   ; GCN:   DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
   ; GCN:   S_ENDPGM 0
-  ; GCN: bb.5:
+  ; GCN: bb.6:
   ; GCN:   successors: %bb.4(0x80000000)
   ; GCN:   S_BRANCH %bb.4
   bb.0:
@@ -865,18 +905,20 @@
     BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
-    $exec = S_OR_B64 $exec, %12, implicit-def $scc
-    S_BRANCH %bb.5
+    $exec = S_OR_B64_term $exec, %12, implicit-def $scc
+    S_BRANCH %bb.6
 
   bb.4:
-    $exec = S_OR_B64 $exec, %3, implicit-def $scc
+    $exec = S_OR_B64_term $exec, %3, implicit-def $scc
+
+  bb.5:
     %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
     %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     $m0 = S_MOV_B32 -1
     DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
     S_ENDPGM 0
 
-  bb.5:
+  bb.6:
     S_BRANCH %bb.4
 
 ...
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -15,13 +15,14 @@
 }
 
 ; FUNC-LABEL: {{^}}ds_ordered_swap_conditional:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GCN: s_and_saveexec_b64 s[[SAVED:\[[0-9]+:[0-9]+\]]], vcc
 ; // We have to use s_cbranch, because ds_ordered_count has side effects with EXEC=0
 ; GCN: s_cbranch_execz [[BB:BB._.]]
 ; GCN: s_mov_b32 m0, s0
 ; VIGFX9-NEXT: s_nop 0
-; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds
+; GCN-NEXT: ds_ordered_count v0, v1 offset:4868 gds
 ; GCN-NEXT: [[BB]]:
 ; // Wait for expcnt(0) before modifying EXEC
 ; GCN-NEXT: s_waitcnt expcnt(0)