Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -811,8 +811,8 @@
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSILowerI1CopiesPass());
   return false;
 }
 
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+         TRI.hasVGPRs(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+         TRI.hasVGPRs(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
Index: lib/Target/AMDGPU/SILowerI1Copies.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,60 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type. Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
 //===----------------------------------------------------------------------===//
 //
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 
+#define DEBUG_TYPE "si-i1-copies"
+
 using namespace llvm;
 
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
 namespace {
 
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
 
+private:
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  DenseSet<unsigned> ConstrainRegs;
+
+public:
   SILowerI1Copies() : MachineFunctionPass(ID) {
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,13 +70,341 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  void lowerCopiesFromI1();
+  void lowerPhis();
+  void lowerCopiesToI1();
+  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+
+  bool isLaneMaskReg(unsigned Reg) const {
+    if (Reg == AMDGPU::VCC || Reg == AMDGPU::EXEC)
+      return true;
+
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+    return RC == &AMDGPU::SReg_64RegClass ||
+           RC == &AMDGPU::SReg_64_XEXECRegClass;
+  }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+///  - Determine all basic blocks which, starting from the incoming blocks,
+///    a wave may reach before entering the def block (the block containing the
+///    phi).
+///  - If an incoming block has no predecessors in this set, we can take the
+///    incoming value as a scalar lane mask as-is.
+///  -- A special case of this is when the def block has a self-loop.
+///  - Otherwise, the incoming value needs to be merged with a previously
+///    defined lane mask.
+///  - If there is a path into the set of reachable blocks that does _not_ go
+///    through an incoming block where we can take the scalar lane mask as-is,
+///    we need to invent an available value for the SSAUpdater. Choices are
+///    0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class PhiIncomingAnalysis {
+  MachinePostDominatorTree &PDT;
+
+  // For each reachable basic block, whether it is a source in the induced
+  // subgraph of the CFG.
+  DenseMap<MachineBasicBlock *, bool> ReachableMap;
+  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+  PhiIncomingAnalysis(MachinePostDominatorTree &PDT)
+      : PDT(PDT) {}
+
+  /// Returns whether \p MBB is a source in the induced subgraph of reachable
+  /// blocks.
+  bool isSource(MachineBasicBlock &MBB) const {
+    return ReachableMap.find(&MBB)->second;
+  }
+
+  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+  void analyze(MachineBasicBlock &DefBlock,
+               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+    assert(Stack.empty());
+    ReachableMap.clear();
+    ReachableOrdered.clear();
+    Predecessors.clear();
+
+    // Insert the def block first, so that it acts as an end point for the
+    // traversal.
+    ReachableMap.try_emplace(&DefBlock, false);
+    ReachableOrdered.push_back(&DefBlock);
+
+    for (MachineBasicBlock *MBB : IncomingBlocks) {
+      if (MBB == &DefBlock) {
+        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+        continue;
+      }
+
+      ReachableMap.try_emplace(MBB, false);
+      ReachableOrdered.push_back(MBB);
+
+      // If this block has a divergent terminator and the def block is its
+      // post-dominator, the wave may first visit the other successors.
+      bool Divergent = false;
+      for (MachineInstr &MI : MBB->terminators()) {
+        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+            MI.getOpcode() == AMDGPU::SI_IF ||
+            MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_LOOP) {
+          Divergent = true;
+          break;
+        }
+      }
+
+      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+        for (MachineBasicBlock *Succ : MBB->successors())
+          Stack.push_back(Succ);
+      }
+    }
+
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!ReachableMap.try_emplace(MBB, false).second)
+        continue;
+      ReachableOrdered.push_back(MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors())
+        Stack.push_back(Succ);
+    }
+
+    for (MachineBasicBlock *MBB : ReachableOrdered) {
+      bool HaveReachablePred = false;
+      for (MachineBasicBlock *Pred : MBB->predecessors()) {
+        if (ReachableMap.count(Pred)) {
+          HaveReachablePred = true;
+        } else {
+          Stack.push_back(Pred);
+        }
+      }
+      if (!HaveReachablePred)
+        ReachableMap[MBB] = true;
+      if (HaveReachablePred) {
+        for (MachineBasicBlock *UnreachablePred : Stack) {
+          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+            Predecessors.push_back(UnreachablePred);
+        }
+      }
+      Stack.clear();
+    }
+  }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+///  A-+-+
+///  | | |
+///  B-+ |
+///  |   |
+///  C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+///       relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class LoopFinder {
+  MachineDominatorTree &DT;
+  MachinePostDominatorTree &PDT;
+
+  // All visited / reachable block, tagged by level (level 0 is the def block,
+  // level 1 are all blocks reachable including but not going through the def
+  // block's IPDOM, etc.).
+  DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+  // Nearest common dominator of all visited blocks by level (level 0 is the
+  // def block). Used for seeding the SSAUpdater.
+  SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+  // Post-dominator of all visited blocks.
+  MachineBasicBlock *VisitedPostDom = nullptr;
+
+  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+  // reachable without going through the IPDOM of the def block (if the IPDOM
+  // itself has an edge to the def block, the loop level is 2), etc.
+  unsigned FoundLoopLevel = ~0u;
+
+  MachineBasicBlock *DefBlock = nullptr;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+      : DT(DT), PDT(PDT) {}
+
+  void initialize(MachineBasicBlock &MBB) {
+    Visited.clear();
+    CommonDominators.clear();
+    Stack.clear();
+    NextLevel.clear();
+    VisitedPostDom = nullptr;
+    FoundLoopLevel = ~0u;
+
+    DefBlock = &MBB;
+  }
+
+  /// Check whether a backward edge can be reached without going through the
+  /// given \p PostDom of the def block.
+  ///
+  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+  unsigned findLoop(MachineBasicBlock *PostDom) {
+    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+    if (!VisitedPostDom)
+      advanceLevel();
+
+    unsigned Level = 0;
+    while (PDNode->getBlock() != PostDom) {
+      if (PDNode->getBlock() == VisitedPostDom)
+        advanceLevel();
+      PDNode = PDNode->getIDom();
+      Level++;
+      if (FoundLoopLevel == Level)
+        return Level;
+    }
+
+    return 0;
+  }
+
+  /// Add undef values dominating the loop and the optionally given additional
+  /// blocks, so that the SSA updater doesn't have to search all the way to the
+  /// function entry.
+  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+    assert(LoopLevel < CommonDominators.size());
+
+    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+    for (MachineBasicBlock *MBB : Blocks)
+      Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+    } else {
+      // The dominator is part of the loop or the given blocks, so add the
+      // undef value to unreachable predecessors instead.
+      for (MachineBasicBlock *Pred : Dom->predecessors()) {
+        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+      }
+    }
+  }
+
+private:
+  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+                   ArrayRef<MachineBasicBlock *> Blocks) const {
+    auto DomIt = Visited.find(&MBB);
+    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+      return true;
+
+    if (llvm::find(Blocks, &MBB) != Blocks.end())
+      return true;
+
+    return false;
+  }
+
+  void advanceLevel() {
+    MachineBasicBlock *VisitedDom;
+
+    if (!VisitedPostDom) {
+      VisitedPostDom = DefBlock;
+      VisitedDom = DefBlock;
+      Stack.push_back(DefBlock);
+    } else {
+      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+      VisitedDom = CommonDominators.back();
+
+      for (unsigned i = 0; i < NextLevel.size();) {
+        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+          Stack.push_back(NextLevel[i]);
+
+          NextLevel[i] = NextLevel.back();
+          NextLevel.pop_back();
+        } else {
+          i++;
+        }
+      }
+    }
+
+    unsigned Level = CommonDominators.size();
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!PDT.dominates(VisitedPostDom, MBB))
+        NextLevel.push_back(MBB);
+
+      Visited[MBB] = Level;
+      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors()) {
+        if (Succ == DefBlock) {
+          if (MBB == VisitedPostDom)
+            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+          else
+            FoundLoopLevel = std::min(FoundLoopLevel, Level);
+          continue;
+        }
+
+        if (Visited.try_emplace(Succ, ~0u).second) {
+          if (MBB == VisitedPostDom)
+            NextLevel.push_back(Succ);
+          else
+            Stack.push_back(Succ);
+        }
+      }
+    }
+
+    CommonDominators.push_back(VisitedDom);
+  }
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
+                      "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
                 "SI Lower i1 Copies", false, false)
 
 char SILowerI1Copies::ID = 0;
 
@@ -64,104 +415,376 @@
   return new SILowerI1Copies();
 }
 
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+  MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+  unsigned UndefReg = createLaneMaskReg(MF);
+  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+          UndefReg);
+  return UndefReg;
+}
 
-  std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+  MF = &TheMF;
+  MRI = &MF->getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
 
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
+  lowerCopiesFromI1();
+  lowerPhis();
+  lowerCopiesToI1();
 
-      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        unsigned Reg = MI.getOperand(0).getReg();
-        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-        if (RC == &AMDGPU::VReg_1RegClass)
-          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
-        continue;
-      }
+  for (unsigned Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+  ConstrainRegs.clear();
+
+  return true;
+}
 
+void SILowerI1Copies::lowerCopiesFromI1() {
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
-      const MachineOperand &Dst = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
-
-      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
         continue;
 
-      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+      if (isLaneMaskReg(DstReg) ||
+          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+        continue;
 
+      // Copy into a 32-bit vector register.
+      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
       DebugLoc DL = MI.getDebugLoc();
-      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
-      if (DstRC == &AMDGPU::VReg_1RegClass &&
-          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(Dst.getReg());
-
-        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
-          if (DefInst->getOperand(1).isImm()) {
-            I1Defs.push_back(Dst.getReg());
-
-            int64_t Val = DefInst->getOperand(1).getImm();
-            assert(Val == 0 || Val == -1);
-
-            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
-              .add(Dst)
-              .addImm(Val);
-            MI.eraseFromParent();
-            continue;
+
+      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+
+      ConstrainRegs.insert(SrcReg);
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addImm(0)
+          .addImm(-1)
+          .addReg(SrcReg);
+      DeadCopies.push_back(&MI);
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+void SILowerI1Copies::lowerPhis() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  PhiIncomingAnalysis PIA(*PDT);
+  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+  SmallVector<unsigned, 4> IncomingRegs;
+  SmallVector<unsigned, 4> IncomingUpdated;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB.phis()) {
+      if (!MI.isPHI())
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+      // Collect incoming values.
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        assert(i + 1 < MI.getNumOperands());
+        unsigned IncomingReg = MI.getOperand(i).getReg();
+        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+          IncomingReg = IncomingDef->getOperand(1).getReg();
+          assert(isLaneMaskReg(IncomingReg));
+        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+          continue;
+        } else {
+          assert(IncomingDef->isPHI());
+        }
+
+        IncomingBlocks.push_back(IncomingMBB);
+        IncomingRegs.push_back(IncomingReg);
+      }
+
+      // Phis in a loop that are observed outside the loop receive a simple but
+      // conservatively correct treatment.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+      SSAUpdater.Initialize(DstReg);
+
+      if (FoundLoopLevel) {
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                       IncomingUpdated.back());
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(IMBB, IMBB.getFirstInstrTerminator(), {},
+                              IncomingUpdated[i],
+                              SSAUpdater.GetValueInMiddleOfBlock(&IMBB),
+                              IncomingRegs[i]);
+        }
+      } else {
+        // The phi is not observed from outside a loop. Use a more accurate
+        // lowering.
+        PIA.analyze(MBB, IncomingBlocks);
+
+        for (MachineBasicBlock *MBB : PIA.predecessors())
+          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          if (PIA.isSource(IMBB)) {
+            IncomingUpdated.push_back(0);
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          } else {
+            IncomingUpdated.push_back(createLaneMaskReg(*MF));
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
           }
         }
-        unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
-          .add(Src);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-          .add(Dst)
-          .addImm(0)
-          .addImm(-1)
-          .addReg(TmpSrc);
-        MI.eraseFromParent();
-      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
-                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
-            DefInst->getOperand(1).getImm() == 0 &&
-            DefInst->getOperand(2).getImm() != 0 &&
-            DefInst->getOperand(3).isReg() &&
-            TargetRegisterInfo::isVirtualRegister(
-              DefInst->getOperand(3).getReg()) &&
-            TRI->getCommonSubClass(
-              MRI.getRegClass(DefInst->getOperand(3).getReg()),
-              &AMDGPU::SGPR_64RegClass) &&
-            AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
-            .add(Dst)
-            .addReg(AMDGPU::EXEC)
-            .add(DefInst->getOperand(3));
-        } else {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
-            .add(Dst)
-            .add(Src)
-            .addImm(0);
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          if (!IncomingUpdated[i])
+            continue;
+
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(IMBB, IMBB.getFirstInstrTerminator(), {},
+                              IncomingUpdated[i],
+                              SSAUpdater.GetValueInMiddleOfBlock(&IMBB),
+                              IncomingRegs[i]);
         }
-        MI.eraseFromParent();
+      }
 
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      if (NewReg != DstReg) {
+        MRI->replaceRegWith(NewReg, DstReg);
+
+        // Ensure that DstReg has a single def and mark the old PHI node for
+        // deletion.
+        MI.getOperand(0).setReg(NewReg);
+        DeadPhis.push_back(&MI);
+      }
+
+      IncomingBlocks.clear();
+      IncomingRegs.clear();
+      IncomingUpdated.clear();
+    }
+
+    for (MachineInstr *MI : DeadPhis)
+      MI->eraseFromParent();
+    DeadPhis.clear();
+  }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+          MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      if (MRI->use_empty(DstReg)) {
+        DeadCopies.push_back(&MI);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+        continue;
+
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          !isLaneMaskReg(SrcReg)) {
+        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+        unsigned TmpReg = createLaneMaskReg(*MF);
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+            .addReg(SrcReg)
+            .addImm(0);
+        MI.getOperand(1).setReg(TmpReg);
+        SrcReg = TmpReg;
+      }
+
+      // Defs in a loop that are observed outside the loop must be transformed
+      // into appropriate bit manipulation.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+      if (FoundLoopLevel) {
+        SSAUpdater.Initialize(DstReg);
+        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+        buildMergeLaneMasks(MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        DeadCopies.push_back(&MI);
+      }
     }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+  const MachineInstr *MI;
+  for (;;) {
+    MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() != AMDGPU::COPY)
+      break;
 
-  for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+    Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+    if (!isLaneMaskReg(Reg))
+      return false;
+  }
+
+  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+    return false;
+
+  if (!MI->getOperand(1).isImm())
+    return false;
+
+  int64_t Imm = MI->getOperand(1).getImm();
+  if (Imm == 0) {
+    Val = false;
+    return true;
+  }
+  if (Imm == -1) {
+    Val = true;
+    return true;
+  }
 
   return false;
 }
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          const DebugLoc &DL, unsigned DstReg,
+                                          unsigned PrevReg, unsigned CurReg) {
+  bool PrevVal;
+  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+  bool CurVal;
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+  if (PrevConstant && CurConstant) {
+    if (PrevVal == CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+          .addReg(CurReg);
+    } else if (CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+          .addReg(AMDGPU::EXEC);
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+          .addReg(AMDGPU::EXEC)
+          .addImm(-1);
+    }
+    return;
+  }
+
+  unsigned PrevMaskedReg = 0;
+  unsigned CurMaskedReg = 0;
+  if (!PrevConstant) {
+    if (CurConstant && CurVal) {
+      PrevMaskedReg = PrevReg;
+    } else {
+      PrevMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+          .addReg(PrevReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+  if (!CurConstant) {
+    // TODO: check whether CurReg is already masked by EXEC
+    if (PrevConstant && PrevVal) {
+      CurMaskedReg = CurReg;
+    } else {
+      CurMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+          .addReg(CurReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+
+  if (PrevConstant && !PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(CurMaskedReg);
+  } else if (CurConstant && !CurVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(PrevMaskedReg);
+  } else if (PrevConstant && PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+        .addReg(CurMaskedReg)
+        .addReg(AMDGPU::EXEC);
+  } else {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+        .addReg(PrevMaskedReg)
+        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+  }
+}
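Aside (not part of the patch): the bitwise sequence that buildMergeLaneMasks emits above boils down to keeping the previous value in lanes that are currently inactive and taking the new value in lanes that are active in EXEC. A minimal standalone sketch of that identity over plain 64-bit masks, with made-up test values for illustration:

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the merge encoded by S_ANDN2_B64 / S_AND_B64 / S_OR_B64:
// lanes set in Exec take the current value, lanes clear in Exec keep the
// previously accumulated value.
static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  const uint64_t Exec = 0x00000000FFFFFFFFull; // lower 32 lanes active
  const uint64_t Prev = 0xFFFFFFFF00000000ull; // result from inactive lanes
  const uint64_t Cur  = 0x00000000AAAAAAAAull; // condition computed now
  assert(mergeLaneMasks(Prev, Cur, Exec) == 0xFFFFFFFFAAAAAAAAull);
  return 0;
}
```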
Index: lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
Index: lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-//    goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-//    also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-//     |
-//   +--+
-//   A  |
-//   +--+
-//     |
-//     B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
-  // Check whether A is reachable from itself without going through B.
-  DenseSet<MachineBasicBlock*> Reachable;
-  SmallVector<MachineBasicBlock*, 8> Stack;
-
-  Stack.push_back(A);
-  do {
-    MachineBasicBlock *MBB = Stack.back();
-    Stack.pop_back();
-
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (Succ == A)
-        return false;
-      if (Succ != B && Reachable.insert(Succ).second)
-        Stack.push_back(Succ);
-    }
-  } while (!Stack.empty());
-
-  return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
Index: lib/Target/AMDGPU/Utils/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
-  AMDGPULaneDominator.cpp
   )
Index: test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
 ; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI: [[ENDIF]]:
-; SI-NOT: [[VREG]]
-; SI: ; %for.end
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body
Index: test/CodeGen/AMDGPU/i1-copy-phi.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI: s_mov_b64 [[TMP]], exec
+
+; SI: ; %bb3
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/inline-asm.ll
===================================================================
--- test/CodeGen/AMDGPU/inline-asm.ll
+++ test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,7 +198,8 @@
 }
 
 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
 ; CHECK: ; use v0
 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 entry:
@@ -212,10 +213,14 @@
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
   %val = load i1, i1 addrspace(1)* undef
-  call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+  store i1 %cc, i1 addrspace(1)* undef
   ret void
 }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
 
@@ -144,20 +144,24 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+
+; SI: ; %entry
+; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI: s_mov_b64 vcc, 0
+; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
+
+; SI: ; %bb
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
+; SI: s_and_b64 vcc, vcc, exec
+
+; SI: ; %exit
+; SI: s_or_b64 exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: buffer_store_dword
+; SI: s_endpgm
 
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -22,23 +22,28 @@
 ; OPT: bb9:
 ; OPT: call void @llvm.amdgcn.end.cf(i64
 
-; TODO: Can remove exec fixes in return block
 ; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
-
-; GCN: [[FLOW]]:
-; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN: v_cmp_lt_i32_e32 vcc, -1
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %bb4
+; GCN: buffer_load_dword
+; GCN: v_cmp_ge_i32_e32 vcc,
+; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
 
 ; GCN: ; %bb.4: ; %bb9
 ; GCN-NEXT: s_endpgm
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
 
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
+; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: v_cmp_lt_i32_e32 vcc, 1,
+; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
+; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
 
 ; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+
+; GCN: ; %LeafBlock
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
+; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]
 
 ; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
 
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT; s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
 
 ; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN: buffer_store_dword
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
Index: test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- test/CodeGen/AMDGPU/multilevel-break.ll
+++ test/CodeGen/AMDGPU/multilevel-break.ll
@@ -21,34 +21,46 @@
 
 ; GCN-LABEL: {{^}}multi_else_break:
 
+; GCN: ; %main_body
+; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
-
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
-
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
-
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG: v_cmp_eq_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
+
+; GCN: ; %Flow
+; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
+; GCN: s_andn2_b64 exec, exec, [[TMP0]]
+; GCN: s_cbranch_execnz [[INNER_LOOP]]
+
+; GCN: ; %Flow2
+; GCN: s_or_b64 exec, exec, [[TMP0]]
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN: s_cbranch_execnz [[OUTER_LOOP]]
+
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
   br label %LOOP.outer
@@ -78,12 +90,38 @@
 ; OPT: llvm.amdgcn.end.cf
 
 ; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+
+; GCN: ; %LeafBlock1
+; GCN: s_mov_b64
+; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
+; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN: s_orn2_b64 [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: s_mov_b64 [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
+; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
+; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]
 
-; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
-; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN: ; %Flow4
+; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
+; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT]]
 ; GCN-NEXT: s_cbranch_execnz
 
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
Index: test/CodeGen/AMDGPU/select-opt.ll
===================================================================
--- test/CodeGen/AMDGPU/select-opt.ll
+++ test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
 ; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@
   ret void
 }
 
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
-
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+
+; SI: ; %else
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+
+; SI: ; %if
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
+; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI: buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
Index: test/CodeGen/AMDGPU/si-annotate-cf.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -27,18 +27,23 @@
 
 ; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0
 
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI: s_and_b64 [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
+; SI: s_andn2_b64 exec, exec, [[LEFT]]
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
 
 define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
@@ -90,19 +95,21 @@
 
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI: s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI: s_mov_b64 vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI: buffer_store_dword
 
 ; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:
 
-; SI: [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI: [[CONVEX_EXIT]]:
+; SI: s_mov_b64 vcc,
+; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
 ; SI: s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
 ; SI: s_cbranch_vccz [[ENDPGM]]
 
 ; SI: [[ENDPGM]]:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@
 
 ; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
 ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: s_and_saveexec_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
 ; SI: [[FLOW_BB]]:
 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
@@ -220,9 +219,10 @@
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
 ; SI: [[LABEL_EXIT]]:
Index: test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}testKernel
 ; GCN: BB0_1:
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
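Aside (not part of the patch): the rewritten FileCheck patterns above all revolve around the same accumulation idiom — on each loop iteration the still-active lanes (EXEC) overwrite their bit in an accumulator via s_andn2_b64 / s_and_b64 / s_or_b64, while lanes that already exited keep the value from their last iteration. A minimal sketch of that accumulation over two iterations, with made-up exec and condition masks purely for illustration:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  uint64_t Accum = 0;
  // Per-iteration (Exec, Cond) pairs; the values are illustrative only.
  const std::vector<std::pair<uint64_t, uint64_t>> Iterations = {
      {0xFFFFull, 0x00F0ull}, // all 16 lanes active, some take the condition
      {0x0F0Full, 0x0F00ull}, // several lanes have already left the loop
  };
  for (const auto &It : Iterations) {
    const uint64_t Exec = It.first, Cond = It.second;
    // Merge under the live lanes; exited lanes keep their accumulated bit.
    Accum = (Accum & ~Exec) | (Cond & Exec);
  }
  assert(Accum == 0x0FF0ull);
  return 0;
}
```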