Index: include/llvm/CodeGen/MachineInstr.h
===================================================================
--- include/llvm/CodeGen/MachineInstr.h
+++ include/llvm/CodeGen/MachineInstr.h
@@ -70,7 +70,11 @@
     FrameDestroy = 1 << 1,              // Instruction is used as a part of
                                         // function frame destruction code.
     BundledPred  = 1 << 2,              // Instruction has bundled predecessors.
-    BundledSucc  = 1 << 3               // Instruction has bundled successors.
+    BundledSucc  = 1 << 3,              // Instruction has bundled successors.
+    Initiator    = 1 << 4,              // Instruction is used as a part of
+                                        // target-specific basic block prolog.
+    Terminator   = 1 << 5               // Instruction is used as a part of
+                                        // target-specific basic block epilog.
   };
 private:
   const MCInstrDesc *MCID;              // Instruction descriptor.
@@ -445,13 +449,20 @@
     return hasProperty(MCID::Barrier, Type);
   }
 
+  /// Returns true if this instruction is part of the initiator for a basic
+  /// block. This can be used by targets that have non-uniform control flow
+  /// to set up execution masks.
+  bool isInitiator() const {
+    return getFlag(Initiator); // TODO: QueryType?
+  }
+
   /// Returns true if this instruction part of the terminator for a basic block.
   /// Typically this is things like return and branch instructions.
   ///
   /// Various passes use this to insert code into the bottom of a basic block,
   /// but before control flow occurs.
   bool isTerminator(QueryType Type = AnyInBundle) const {
-    return hasProperty(MCID::Terminator, Type);
+    return hasProperty(MCID::Terminator, Type) || getFlag(Terminator); // TODO: QueryType?
   }
 
   /// Returns true if this is a conditional, unconditional, or indirect branch.
Index: lib/CodeGen/MachineInstr.cpp
===================================================================
--- lib/CodeGen/MachineInstr.cpp
+++ lib/CodeGen/MachineInstr.cpp
@@ -1929,19 +1929,25 @@
   }
 
   bool HaveSemi = false;
-  const unsigned PrintableFlags = FrameSetup | FrameDestroy;
+  const unsigned PrintableFlags = FrameSetup | FrameDestroy | Initiator | Terminator;
   if (Flags & PrintableFlags) {
     if (!HaveSemi) {
       OS << ";";
       HaveSemi = true;
     }
-    OS << " flags: ";
+    OS << " flags:";
     if (Flags & FrameSetup)
-      OS << "FrameSetup";
+      OS << " FrameSetup";
 
     if (Flags & FrameDestroy)
-      OS << "FrameDestroy";
+      OS << " FrameDestroy";
+
+    if (Flags & Initiator)
+      OS << " Initiator";
+
+    if (Flags & Terminator)
+      OS << " Terminator";
   }
 
   if (!memoperands_empty()) {
Index: lib/CodeGen/MachineVerifier.cpp
===================================================================
--- lib/CodeGen/MachineVerifier.cpp
+++ lib/CodeGen/MachineVerifier.cpp
@@ -81,6 +81,7 @@
     typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet;
 
     const MachineInstr *FirstTerminator;
+    bool SeenNonInitiator;
     BlockSet FunctionBlocks;
 
     BitVector regsReserved;
@@ -573,6 +574,7 @@
 void
 MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
   FirstTerminator = nullptr;
+  SeenNonInitiator = false;
 
   if (!MF->getProperties().hasProperty(
       MachineFunctionProperties::Property::NoPHIs)) {
@@ -788,6 +790,13 @@
     lastIndex = idx;
   }
 
+  // Ensure initiators don't follow non-initiators.
+  if (!MI->isInitiator()) {
+    SeenNonInitiator = true;
+  } else if (SeenNonInitiator) {
+    report("Initiator instruction after a non-initiator", MI);
+  }
+
   // Ensure non-terminators don't follow terminators.
   // Ignore predicated terminators formed by if conversion.
   // FIXME: If conversion shouldn't need to violate this rule.
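For reference, the intended usage of the two flags from target code, distilled from
the AMDGPU changes further down. This is an illustrative sketch only, not part of the
patch: tagMaskOps, CondReg and SaveReg are placeholders, and the AMDGPU target-internal
headers are assumed to be available.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "SIInstrInfo.h" // AMDGPU target-internal, assumed in-tree.

using namespace llvm;

// Tag the EXEC-mask manipulation around divergent control flow so that
// generic code can tell the block prolog/epilog apart from the block body.
static void tagMaskOps(MachineBasicBlock &Head, MachineBasicBlock &Join,
                       const SIInstrInfo *TII, unsigned CondReg,
                       unsigned SaveReg) {
  DebugLoc DL;
  // Epilog of Head: save EXEC and narrow it to the taken lanes. The flag
  // makes isTerminator() true, so generic passes won't sink code below it.
  BuildMI(Head, Head.end(), DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveReg)
      .addReg(CondReg)
      .setMIFlag(MachineInstr::Terminator);
  // Prolog of Join: restore the saved lanes. The verifier change above
  // requires all Initiator-flagged instructions to lead the block.
  BuildMI(Join, Join.begin(), DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(SaveReg)
      .setMIFlag(MachineInstr::Initiator);
}

Note the asymmetry: isInitiator() is purely flag-driven, while isTerminator() remains
true for MCID-level terminators as well; the TODOs about QueryType track the open
question of bundle handling.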
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -357,12 +357,6 @@
                                 MachineBasicBlock::iterator I,
                                 const Counters &Required) {
 
-  // End of program? No need to wait on anything
-  // A function not returning void needs to wait, because other bytecode will
-  // be appended after it and we don't know what it will be.
-  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
-    return false;
-
   // Figure out if the async instructions execute in order
   bool Ordered[3];
 
@@ -409,11 +403,17 @@
   ExpInstrTypesSeen = 0;
 
   // Build the wait instruction
-  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  MachineInstr *Wait =
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
           .addImm((Counts.Named.VM & 0xF) |
                   ((Counts.Named.EXP & 0x7) << 4) |
                   ((Counts.Named.LGKM & 0xF) << 8));
 
+  if (MachineInstr *Prev = Wait->getPrevNode()) {
+    if (Prev->isTerminator())
+      Wait->setFlag(MachineInstr::Terminator);
+  }
+
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
   return true;
@@ -585,12 +585,13 @@
 
       Counters Required;
 
-      // Wait for everything before a barrier.
+      // Wait for everything before a branch or barrier.
       //
       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
       // but we also want to wait for any other outstanding transfers before
       // signalling other hardware blocks
-      if (I->getOpcode() == AMDGPU::S_BARRIER ||
+      if (I->isBranch() || I->getOpcode() == AMDGPU::SI_MASK_BRANCH ||
+          I->getOpcode() == AMDGPU::S_BARRIER ||
           I->getOpcode() == AMDGPU::S_SENDMSG)
         Required = LastIssued;
       else
@@ -607,8 +608,11 @@
         handleSendMsg(MBB, I);
     }
 
-    // Wait for everything at the end of the MBB
-    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+    // Wait for everything at the end of the MBB, in case there are no
+    // branches. No need to wait at the end of the (void-returning) program,
+    // since the hardware does so automatically.
+    if (!MBB.empty() && MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+      Changes |= insertWait(MBB, MBB.end(), LastIssued);
   }
 
   for (MachineInstr *I : RemoveMI)
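For context on the .addImm() expression above: S_WAITCNT packs all three hardware
counters into a single immediate. A standalone sketch of the encoding as used by
insertWait() (field widths on this hardware generation: 4-bit vmcnt, 3-bit expcnt,
4-bit lgkmcnt); encodeWaitcnt is a hypothetical helper, not part of the patch.

#include <cstdint>

// vmcnt in bits [3:0], expcnt in bits [6:4], lgkmcnt in bits [11:8].
// A counter value N means "stall until at most N operations of that class
// are still outstanding", so the field's maximum value means "no wait".
static uint16_t encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
  return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0xF) << 8);
}

// Example: encodeWaitcnt(0, 0x7, 0xF) == 0xF70 drains all vector-memory
// operations while leaving export and LGKM traffic unconstrained.

Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -125,12 +125,14 @@
   MachineInstr *AndSaveExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
-    .addOperand(Cond);
+    .addOperand(Cond)
+    .setMIFlag(MachineInstr::Terminator);
 
   MachineInstr *Xor =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
     .addReg(AMDGPU::EXEC)
-    .addReg(SaveExecReg);
+    .addReg(SaveExecReg)
+    .setMIFlag(MachineInstr::Terminator);
 
   // Insert a pseudo terminator to help keep the verifier happy. This will also
   // be used later when inserting skips.
@@ -171,7 +173,8 @@
   // else.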
   MachineInstr *OrSaveExec =
     BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
-    .addOperand(MI.getOperand(1)); // Saved EXEC
+    .addOperand(MI.getOperand(1))  // Saved EXEC
+    .setMIFlag(MachineInstr::Initiator);
 
   MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
   MachineBasicBlock::iterator ElsePt(MI);
@@ -180,7 +183,8 @@
   MachineInstr *And =
     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
     .addReg(AMDGPU::EXEC)
-    .addReg(DstReg);
+    .addReg(DstReg)
+    .setMIFlag(MachineInstr::Terminator);
 
   if (LIS)
     LIS->InsertMachineInstrInMaps(*And);
@@ -189,12 +193,13 @@
   MachineInstr *Xor =
     BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addReg(DstReg);
+    .addReg(DstReg)
+    .setMIFlag(MachineInstr::Terminator);
 
-  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
-  // Insert a pseudo terminator to help keep the verifier happy.
+  // Insert an additional pseudo terminator to help keep the verifier happy
+  // and mark the location for skips to be inserted later.
   MachineInstr *Branch =
-    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
     .addMBB(DestBB);
 
   if (!LIS) {
@@ -248,7 +253,8 @@
   MachineInstr *AndN2 =
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addOperand(MI.getOperand(0));
+    .addOperand(MI.getOperand(0))
+    .setMIFlag(MachineInstr::Terminator);
 
   MachineInstr *Branch =
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
@@ -270,7 +276,8 @@
   MachineInstr *NewMI =
     BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
     .addReg(AMDGPU::EXEC)
-    .addOperand(MI.getOperand(0));
+    .addOperand(MI.getOperand(0))
+    .setMIFlag(MachineInstr::Initiator);
 
   if (LIS)
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -41,7 +41,7 @@
 ; GCN: s_andn2_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
-; GCN: ; BB#4: ; %bb9
+; GCN-NEXT: BB0_4: ; %bb9
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_endpgm
 define void @break_loop(i32 %arg) #0 {
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -167,7 +167,7 @@
 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
-; SI: BB#5
+; SI: BB{{[0-9]+_[0-9]+}}: ; %Flow8
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
 
 ; SI: [[LABEL_EXIT]]:
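A consequence of flagging the mask manipulation as Terminator: passes that must insert
code "at the bottom of a block, but still under the block's original EXEC mask" can now
find that point generically instead of special-casing opcodes. A possible helper, shown
only as a sketch (epilogBegin is hypothetical and not part of this patch):

#include <iterator>
#include "llvm/CodeGen/MachineBasicBlock.h"

using namespace llvm;

// Return the first instruction of the block epilog, i.e. the insertion
// point just above the run of terminators (flagged or MCID-level) that the
// verifier keeps contiguous at the end of the block.
static MachineBasicBlock::iterator epilogBegin(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator I = MBB.end();
  while (I != MBB.begin() && std::prev(I)->isTerminator())
    --I;
  return I;
}

This mirrors what SIInsertWaits now does by hand when it tags an S_WAITCNT whose
predecessor is a terminator.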