Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -15,12 +15,15 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "Utils/AMDGPULoopHelpers.h" #define DEBUG_TYPE "si-fold-operands" using namespace llvm; @@ -91,6 +94,9 @@ const GCNSubtarget *ST; const SIMachineFunctionInfo *MFI; + MachineDominatorTree* MDT = nullptr; + MachinePostDominatorTree* MPDT = nullptr; + void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx, @@ -116,6 +122,8 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -417,7 +425,6 @@ const MachineInstr &MI, const MachineOperand &UseMO) { return !UseMO.isUndef() && !TII->isSDWA(MI); - //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } static bool tryToFoldACImm(const SIInstrInfo *TII, @@ -1086,6 +1093,18 @@ Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { + if (Fold.isReg()) { + MachineInstr *DefMI = Fold.OpToFold->getParent(); + if (DefMI->readsRegister(AMDGPU::EXEC)) { + MachineInstr* UseMI = Fold.UseMI; + MachineBasicBlock* PDom = MPDT->findNearestCommonDominator( + DefMI->getParent(), UseMI->getParent()); + LoopFinder LF = LoopFinder(*MDT, *MPDT); + LF.initialize(*DefMI->getParent()); + if (LF.findLoop(PDom)) + continue; + } + } if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { @@ -1320,6 +1339,9 @@ TRI = &TII->getRegisterInfo(); MFI = MF.getInfo(); + MDT = &getAnalysis(); + MPDT = &getAnalysis(); + // omod is ignored by hardware if IEEE bit is enabled. omod also does not // correctly handle signed zeros. // Index: llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" +#include "Utils/AMDGPULoopHelpers.h" #define DEBUG_TYPE "si-i1-copies" @@ -105,6 +106,12 @@ TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == ST->getWavefrontSize(); } + /// Add undef values dominating the loop and the optionally given additional + /// blocks, so that the SSA updater doesn't have to search all the way to the + /// function entry. + void addLoopEntries(LoopFinder &LF, unsigned LoopLevel, + MachineSSAUpdater &SSAUpdater, + ArrayRef Blocks = {}); }; /// Helper class that determines the relationship between incoming values of a @@ -222,190 +229,7 @@ } }; -/// Helper class that detects loops which require us to lower an i1 COPY into -/// bitwise manipulation. -/// -/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish -/// between loops with the same header. Consider this example: -/// -/// A-+-+ -/// | | | -/// B-+ | -/// | | -/// C---+ -/// -/// A is the header of a loop containing A, B, and C as far as LoopInfo is -/// concerned. However, an i1 COPY in B that is used in C must be lowered to -/// bitwise operations to combine results from different loop iterations when -/// B has a divergent branch (since by default we will compile this code such -/// that threads in a wave are merged at the entry of C). -/// -/// The following rule is implemented to determine whether bitwise operations -/// are required: use the bitwise lowering for a def in block B if a backward -/// edge to B is reachable without going through the nearest common -/// post-dominator of B and all uses of the def. -/// -/// TODO: This rule is conservative because it does not check whether the -/// relevant branches are actually divergent. -/// -/// The class is designed to cache the CFG traversal so that it can be re-used -/// for multiple defs within the same basic block. -/// -/// TODO: We could use region analysis to quickly skip over SESE regions during -/// the traversal. -/// -class LoopFinder { - MachineDominatorTree &DT; - MachinePostDominatorTree &PDT; - - // All visited / reachable block, tagged by level (level 0 is the def block, - // level 1 are all blocks reachable including but not going through the def - // block's IPDOM, etc.). - DenseMap Visited; - - // Nearest common dominator of all visited blocks by level (level 0 is the - // def block). Used for seeding the SSAUpdater. - SmallVector CommonDominators; - - // Post-dominator of all visited blocks. - MachineBasicBlock *VisitedPostDom = nullptr; - - // Level at which a loop was found: 0 is not possible; 1 = a backward edge is - // reachable without going through the IPDOM of the def block (if the IPDOM - // itself has an edge to the def block, the loop level is 2), etc. - unsigned FoundLoopLevel = ~0u; - MachineBasicBlock *DefBlock = nullptr; - SmallVector Stack; - SmallVector NextLevel; - -public: - LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) - : DT(DT), PDT(PDT) {} - - void initialize(MachineBasicBlock &MBB) { - Visited.clear(); - CommonDominators.clear(); - Stack.clear(); - NextLevel.clear(); - VisitedPostDom = nullptr; - FoundLoopLevel = ~0u; - - DefBlock = &MBB; - } - - /// Check whether a backward edge can be reached without going through the - /// given \p PostDom of the def block. - /// - /// Return the level of \p PostDom if a loop was found, or 0 otherwise. - unsigned findLoop(MachineBasicBlock *PostDom) { - MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); - - if (!VisitedPostDom) - advanceLevel(); - - unsigned Level = 0; - while (PDNode->getBlock() != PostDom) { - if (PDNode->getBlock() == VisitedPostDom) - advanceLevel(); - PDNode = PDNode->getIDom(); - Level++; - if (FoundLoopLevel == Level) - return Level; - } - - return 0; - } - - /// Add undef values dominating the loop and the optionally given additional - /// blocks, so that the SSA updater doesn't have to search all the way to the - /// function entry. - void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, - ArrayRef Blocks = {}) { - assert(LoopLevel < CommonDominators.size()); - - MachineBasicBlock *Dom = CommonDominators[LoopLevel]; - for (MachineBasicBlock *MBB : Blocks) - Dom = DT.findNearestCommonDominator(Dom, MBB); - - if (!inLoopLevel(*Dom, LoopLevel, Blocks)) { - SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); - } else { - // The dominator is part of the loop or the given blocks, so add the - // undef value to unreachable predecessors instead. - for (MachineBasicBlock *Pred : Dom->predecessors()) { - if (!inLoopLevel(*Pred, LoopLevel, Blocks)) - SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); - } - } - } - -private: - bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, - ArrayRef Blocks) const { - auto DomIt = Visited.find(&MBB); - if (DomIt != Visited.end() && DomIt->second <= LoopLevel) - return true; - - if (llvm::find(Blocks, &MBB) != Blocks.end()) - return true; - - return false; - } - - void advanceLevel() { - MachineBasicBlock *VisitedDom; - - if (!VisitedPostDom) { - VisitedPostDom = DefBlock; - VisitedDom = DefBlock; - Stack.push_back(DefBlock); - } else { - VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); - VisitedDom = CommonDominators.back(); - - for (unsigned i = 0; i < NextLevel.size();) { - if (PDT.dominates(VisitedPostDom, NextLevel[i])) { - Stack.push_back(NextLevel[i]); - - NextLevel[i] = NextLevel.back(); - NextLevel.pop_back(); - } else { - i++; - } - } - } - - unsigned Level = CommonDominators.size(); - while (!Stack.empty()) { - MachineBasicBlock *MBB = Stack.pop_back_val(); - if (!PDT.dominates(VisitedPostDom, MBB)) - NextLevel.push_back(MBB); - - Visited[MBB] = Level; - VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); - - for (MachineBasicBlock *Succ : MBB->successors()) { - if (Succ == DefBlock) { - if (MBB == VisitedPostDom) - FoundLoopLevel = std::min(FoundLoopLevel, Level + 1); - else - FoundLoopLevel = std::min(FoundLoopLevel, Level); - continue; - } - - if (Visited.try_emplace(Succ, ~0u).second) { - if (MBB == VisitedPostDom) - NextLevel.push_back(Succ); - else - Stack.push_back(Succ); - } - } - } - - CommonDominators.push_back(VisitedDom); - } -}; } // End anonymous namespace. @@ -600,7 +424,7 @@ SSAUpdater.Initialize(DstReg); if (FoundLoopLevel) { - LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks); + addLoopEntries(LF, FoundLoopLevel, SSAUpdater, IncomingBlocks); for (unsigned i = 0; i < IncomingRegs.size(); ++i) { IncomingUpdated.push_back(createLaneMaskReg(*MF)); @@ -721,7 +545,7 @@ if (FoundLoopLevel) { SSAUpdater.Initialize(DstReg); SSAUpdater.AddAvailableValue(&MBB, DstReg); - LF.addLoopEntries(FoundLoopLevel, SSAUpdater); + addLoopEntries(LF, FoundLoopLevel, SSAUpdater); buildMergeLaneMasks(MBB, MI, DL, DstReg, SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); @@ -873,3 +697,26 @@ .addReg(CurMaskedReg ? CurMaskedReg : ExecReg); } } + +/// Add undef values dominating the loop and the optionally given additional + /// blocks, so that the SSA updater doesn't have to search all the way to the + /// function entry. +void SILowerI1Copies::addLoopEntries(LoopFinder& LF, + unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, + ArrayRef Blocks) { + + MachineBasicBlock *Dom = LF.getCommonDominator(LoopLevel); + for (MachineBasicBlock *MBB : Blocks) + Dom = DT->findNearestCommonDominator(Dom, MBB); + + if (!LF.inLoopLevel(*Dom, LoopLevel, Blocks)) { + SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); + } else { + // The dominator is part of the loop or the given blocks, so add the + // undef value to unreachable predecessors instead. + for (MachineBasicBlock *Pred : Dom->predecessors()) { + if (!LF.inLoopLevel(*Pred, LoopLevel, Blocks)) + SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); + } + } +} \ No newline at end of file Index: llvm/lib/Target/AMDGPU/Utils/AMDGPULoopHelpers.h =================================================================== --- /dev/null +++ llvm/lib/Target/AMDGPU/Utils/AMDGPULoopHelpers.h @@ -0,0 +1,105 @@ +/// Helper class that detects loops which require us to lower an i1 COPY into +/// bitwise manipulation. +/// +/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish +/// between loops with the same header. Consider this example: +/// +/// A-+-+ +/// | | | +/// B-+ | +/// | | +/// C---+ +/// +/// A is the header of a loop containing A, B, and C as far as LoopInfo is +/// concerned. However, an i1 COPY in B that is used in C must be lowered to +/// bitwise operations to combine results from different loop iterations when +/// B has a divergent branch (since by default we will compile this code such +/// that threads in a wave are merged at the entry of C). +/// +/// The following rule is implemented to determine whether bitwise operations +/// are required: use the bitwise lowering for a def in block B if a backward +/// edge to B is reachable without going through the nearest common +/// post-dominator of B and all uses of the def. +/// +/// TODO: This rule is conservative because it does not check whether the +/// relevant branches are actually divergent. +/// +/// The class is designed to cache the CFG traversal so that it can be re-used +/// for multiple defs within the same basic block. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. +/// +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULOOPHELPERS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULOOPHELPERS_H + +#include "AMDGPU.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" + +namespace llvm { + +class LoopFinder { + MachineDominatorTree &DT; + MachinePostDominatorTree &PDT; + + // All visited / reachable block, tagged by level (level 0 is the def block, + // level 1 are all blocks reachable including but not going through the def + // block's IPDOM, etc.). + DenseMap Visited; + + // Nearest common dominator of all visited blocks by level (level 0 is the + // def block). Used for seeding the SSAUpdater. + SmallVector CommonDominators; + + // Post-dominator of all visited blocks. + MachineBasicBlock *VisitedPostDom = nullptr; + + // Level at which a loop was found: 0 is not possible; 1 = a backward edge is + // reachable without going through the IPDOM of the def block (if the IPDOM + // itself has an edge to the def block, the loop level is 2), etc. + unsigned FoundLoopLevel = ~0u; + + MachineBasicBlock *DefBlock = nullptr; + SmallVector Stack; + SmallVector NextLevel; + +public: + LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) + : DT(DT), PDT(PDT) {} + + void initialize(MachineBasicBlock &MBB) { + Visited.clear(); + CommonDominators.clear(); + Stack.clear(); + NextLevel.clear(); + VisitedPostDom = nullptr; + FoundLoopLevel = ~0u; + + DefBlock = &MBB; + } + + /// Check whether a backward edge can be reached without going through the + /// given \p PostDom of the def block. + /// + /// Return the level of \p PostDom if a loop was found, or 0 otherwise. + unsigned findLoop(MachineBasicBlock *PostDom); + + MachineBasicBlock *getCommonDominator(unsigned LoopLevel) { + assert(LoopLevel < CommonDominators.size()); + + return CommonDominators[LoopLevel]; + } + + bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, + ArrayRef Blocks) const; + +private: + void advanceLevel(); +}; + +} // namespace llvm +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULOOPHELPERS_H Index: llvm/lib/Target/AMDGPU/Utils/AMDGPULoopHelpers.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AMDGPU/Utils/AMDGPULoopHelpers.cpp @@ -0,0 +1,118 @@ +/// Helper class that detects loops which require us to lower an i1 COPY into +/// bitwise manipulation. +/// +/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish +/// between loops with the same header. Consider this example: +/// +/// A-+-+ +/// | | | +/// B-+ | +/// | | +/// C---+ +/// +/// A is the header of a loop containing A, B, and C as far as LoopInfo is +/// concerned. However, an i1 COPY in B that is used in C must be lowered to +/// bitwise operations to combine results from different loop iterations when +/// B has a divergent branch (since by default we will compile this code such +/// that threads in a wave are merged at the entry of C). +/// +/// The following rule is implemented to determine whether bitwise operations +/// are required: use the bitwise lowering for a def in block B if a backward +/// edge to B is reachable without going through the nearest common +/// post-dominator of B and all uses of the def. +/// +/// TODO: This rule is conservative because it does not check whether the +/// relevant branches are actually divergent. +/// +/// The class is designed to cache the CFG traversal so that it can be re-used +/// for multiple defs within the same basic block. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. +#include "AMDGPULoopHelpers.h" + +using namespace llvm; + +unsigned LoopFinder::findLoop(MachineBasicBlock *PostDom) { + MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); + + if (!VisitedPostDom) + advanceLevel(); + + unsigned Level = 0; + while (PDNode->getBlock() != PostDom) { + if (PDNode->getBlock() == VisitedPostDom) + advanceLevel(); + PDNode = PDNode->getIDom(); + Level++; + if (FoundLoopLevel == Level) + return Level; + } + + return 0; +} + +bool LoopFinder::inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, + ArrayRef Blocks) const { + auto DomIt = Visited.find(&MBB); + if (DomIt != Visited.end() && DomIt->second <= LoopLevel) + return true; + + if (llvm::find(Blocks, &MBB) != Blocks.end()) + return true; + + return false; +} + +void LoopFinder::advanceLevel() { + MachineBasicBlock *VisitedDom; + + if (!VisitedPostDom) { + VisitedPostDom = DefBlock; + VisitedDom = DefBlock; + Stack.push_back(DefBlock); + } else { + VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); + VisitedDom = CommonDominators.back(); + + for (unsigned i = 0; i < NextLevel.size();) { + if (PDT.dominates(VisitedPostDom, NextLevel[i])) { + Stack.push_back(NextLevel[i]); + + NextLevel[i] = NextLevel.back(); + NextLevel.pop_back(); + } else { + i++; + } + } + } + + unsigned Level = CommonDominators.size(); + while (!Stack.empty()) { + MachineBasicBlock *MBB = Stack.pop_back_val(); + if (!PDT.dominates(VisitedPostDom, MBB)) + NextLevel.push_back(MBB); + + Visited[MBB] = Level; + VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == DefBlock) { + if (MBB == VisitedPostDom) + FoundLoopLevel = std::min(FoundLoopLevel, Level + 1); + else + FoundLoopLevel = std::min(FoundLoopLevel, Level); + continue; + } + + if (Visited.try_emplace(Succ, ~0u).second) { + if (MBB == VisitedPostDom) + NextLevel.push_back(Succ); + else + Stack.push_back(Succ); + } + } + } + + CommonDominators.push_back(VisitedDom); +} Index: llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt =================================================================== --- llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -3,4 +3,5 @@ AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp AMDGPUPALMetadata.cpp + AMDGPULoopHelpers.cpp )