Index: llvm/trunk/lib/Target/AMDGPU/SIFixWWMLiveness.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFixWWMLiveness.cpp @@ -10,7 +10,7 @@ /// \file /// Computations in WWM can overwrite values in inactive channels for /// variables that the register allocator thinks are dead. This pass adds fake -/// uses of those variables to WWM instructions to make sure that they aren't +/// uses of those variables to their def(s) to make sure that they aren't /// overwritten. /// /// As an example, consider this snippet: @@ -29,25 +29,44 @@ /// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled, /// it would clobber even the inactive channels for which the if-condition is /// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use -/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the +/// of %vgpr0 to its def to make sure they aren't allocated to the /// same register. /// /// In general, we need to figure out what registers might have their inactive /// channels which are eventually used accidentally clobbered by a WWM -/// instruction. We approximate this using two conditions: +/// instruction. We do that by spotting three separate cases of registers: /// -/// 1. A definition of the variable reaches the WWM instruction. -/// 2. The variable would be live at the WWM instruction if all its defs were -/// partial defs (i.e. considered as a use), ignoring normal uses. -/// -/// If a register matches both conditions, then we add an implicit use of it to -/// the WWM instruction. Condition #2 is the heart of the matter: every -/// definition is really a partial definition, since every VALU instruction is -/// implicitly predicated. We can usually ignore this, but WWM forces us not -/// to. Condition #1 prevents false positives if the variable is undefined at -/// the WWM instruction anyways. 
This is overly conservative in certain cases, -/// especially in uniform control flow, but this is a workaround anyways until -/// LLVM gains the notion of predicated uses and definitions of variables. +/// 1. A "then phi": the value resulting from phi elimination of a phi node at +/// the end of an if..endif. If there is WWM code in the "then", then we +/// make the def at the end of the "then" branch a partial def by adding an +/// implicit use of the register. +/// +/// 2. A "loop exit register": a value written inside a loop but used outside the +/// loop, where there is WWM code inside the loop (the case in the example +/// above). We add an implicit_def of the register in the loop pre-header, +/// and make the original def a partial def by adding an implicit use of the +/// register. +/// +/// 3. A "loop phi": the value resulting from phi elimination of a phi node +/// in a loop header. If there is WWM code inside the loop, then we make all +/// defs inside the loop partial defs by adding an implicit use of the +/// register on each one. +/// +/// Note that we do not need to consider an if..else..endif phi. We only need to +/// consider non-uniform control flow, and control flow structurization would +/// have transformed a non-uniform if..else..endif into two if..endifs. +/// +/// The analysis to detect these cases relies on a property of the MIR +/// arising from this pass running straight after PHIElimination and before any +/// coalescing: that any virtual register with more than one definition must be +/// the new register added to lower a phi node by PHIElimination. +/// +/// FIXME: We should detect whether a register in one of the above categories is +/// already live at the WWM code before deciding to add the implicit uses to +/// synthesize its liveness. +/// +/// FIXME: I believe this whole scheme may be flawed due to the possibility of +/// the register allocator doing live interval splitting. 
/// //===----------------------------------------------------------------------===// @@ -59,7 +78,9 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -71,10 +92,18 @@ class SIFixWWMLiveness : public MachineFunctionPass { private: + MachineDominatorTree *DomTree; + MachineLoopInfo *LoopInfo; LiveIntervals *LIS = nullptr; + const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + std::vector WWMs; + std::vector ThenDefs; + std::vector> LoopExitDefs; + std::vector> LoopPhiDefs; + public: static char ID; @@ -84,13 +113,11 @@ bool runOnMachineFunction(MachineFunction &MF) override; - bool runOnWWMInstruction(MachineInstr &MI); - - void addDefs(const MachineInstr &MI, SparseBitVector<> &set); - StringRef getPassName() const override { return "SI Fix WWM Liveness"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(MachineDominatorsID); + AU.addRequiredID(MachineLoopInfoID); // Should preserve the same set that TwoAddressInstructions does. AU.addPreserved(); AU.addPreserved(); @@ -100,11 +127,21 @@ AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } + +private: + void processDef(MachineOperand &DefOpnd); + bool processThenDef(MachineOperand *DefOpnd); + bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop); + bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop); }; } // End anonymous namespace. 
-INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE, + "SI fix WWM liveness", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE, "SI fix WWM liveness", false, false) char SIFixWWMLiveness::ID = 0; @@ -115,89 +152,267 @@ return new SIFixWWMLiveness(); } -void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs) -{ - for (const MachineOperand &Op : MI.defs()) { - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (TRI->isVGPR(*MRI, Reg)) - Regs.set(Reg); - } - } -} +bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n"); + bool Modified = false; -bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) { - MachineBasicBlock *MBB = WWM.getParent(); + // This doesn't actually need LiveIntervals, but we can preserve them. + LIS = getAnalysisIfAvailable<LiveIntervals>(); - // Compute the registers that are live out of MI by figuring out which defs - // are reachable from MI. - SparseBitVector<> LiveOut; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - for (auto II = MachineBasicBlock::iterator(WWM), IE = - MBB->end(); II != IE; ++II) { - addDefs(*II, LiveOut); - } + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + DomTree = &getAnalysis<MachineDominatorTree>(); + LoopInfo = &getAnalysis<MachineLoopInfo>(); - for (df_iterator<MachineBasicBlock*> I = ++df_begin(MBB), - E = df_end(MBB); - I != E; ++I) { - for (const MachineInstr &MI : **I) { - addDefs(MI, LiveOut); + // Scan the function to find the WWM sections and the candidate registers for + // having liveness modified. 
+ for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::EXIT_WWM) + WWMs.push_back(&MI); + else { + for (MachineOperand &DefOpnd : MI.defs()) { + if (DefOpnd.isReg()) { + unsigned Reg = DefOpnd.getReg(); + if (TRI->isVGPR(*MRI, Reg)) + processDef(DefOpnd); + } + } + } } } + if (!WWMs.empty()) { + // Synthesize liveness over WWM sections as required. + for (auto ThenDef : ThenDefs) + Modified |= processThenDef(ThenDef); + for (auto LoopExitDef : LoopExitDefs) + Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second); + for (auto LoopPhiDef : LoopPhiDefs) + Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second); + } - // Compute the registers that reach MI. - SparseBitVector<> Reachable; + WWMs.clear(); + ThenDefs.clear(); + LoopExitDefs.clear(); + LoopPhiDefs.clear(); - for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE = - MBB->rend(); II != IE; ++II) { - addDefs(*II, Reachable); - } + return Modified; +} - for (idf_iterator<MachineBasicBlock*> I = ++idf_begin(MBB), - E = idf_end(MBB); - I != E; ++I) { - for (const MachineInstr &MI : **I) { - addDefs(MI, Reachable); +// During the function scan, process an operand that defines a VGPR. +// This categorizes the register and puts it in the appropriate list for later +// use when processing a WWM section. +void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) { + unsigned Reg = DefOpnd.getReg(); + // Get all the defining instructions. For convenience, make Defs[0] the def + // we are on now. + SmallVector<MachineInstr *, 4> Defs; + Defs.push_back(DefOpnd.getParent()); + for (auto &MI : MRI->def_instructions(Reg)) { + if (&MI != DefOpnd.getParent()) + Defs.push_back(&MI); + } + // Check whether this def dominates all the others. If not, ignore this def. + // Either it is going to be processed when the scan encounters its other def + // that dominates all defs, or there is no def that dominates all others. 
+ // The latter case is an eliminated phi from an if..else..endif or similar, + // which must be for uniform control flow so can be ignored. + // Because this pass runs shortly after PHIElimination, we assume that any + // multi-def register is a lowered phi, and thus has each def in a separate + // basic block. + for (unsigned I = 1; I != Defs.size(); ++I) { + if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent())) + return; + } + // Check for the case of an if..endif lowered phi: It has two defs, one + // dominates the other, and there is a single use in a successor of the + // dominant def. + // Later we will spot any WWM code inside + // the "then" clause and turn the second def into a partial def so its + // liveness goes through the WWM code in the "then" clause. + if (Defs.size() == 2) { + auto DomDefBlock = Defs[0]->getParent(); + if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) { + auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); + for (auto Succ : DomDefBlock->successors()) { + if (Succ == UseBlock) { + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n"); + ThenDefs.push_back(&DefOpnd); + return; + } + } } } - - // find the intersection, and add implicit uses. - LiveOut &= Reachable; - - bool Modified = false; - for (unsigned Reg : LiveOut) { - WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - if (LIS) { - // FIXME: is there a better way to update the live interval? - LIS->removeInterval(Reg); - LIS->createAndComputeVirtRegInterval(Reg); + // Check for the case of a non-lowered-phi register (single def) that exits + // a loop, that is, it has a use that is outside a loop that the def is + // inside. We find the outermost loop that the def is inside but a use is + // outside. Later we will spot any WWM code inside that loop and then make + // the def a partial def so its liveness goes round the loop and through the + // WWM code. 
+ if (Defs.size() == 1) { + auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent()); + if (!Loop) + return; + bool IsLoopExit = false; + for (auto &Use : MRI->use_instructions(Reg)) { + auto UseBlock = Use.getParent(); + if (Loop->contains(UseBlock)) + continue; + IsLoopExit = true; + while (auto Parent = Loop->getParentLoop()) { + if (Parent->contains(UseBlock)) + break; + Loop = Parent; + } } - Modified = true; + if (!IsLoopExit) + return; + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is a loop exit reg with loop header at " + << "bb." << Loop->getHeader()->getNumber() << "\n"); + LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>( + &DefOpnd, Loop)); + return; } - - return Modified; + // Check for the case of a lowered single-preheader-loop phi, that is, a + // multi-def register where the dominating def is in the loop pre-header and + // all other defs are in backedges. Later we will spot any WWM code inside + // that loop and then make the backedge defs partial defs so the liveness + // goes through the WWM code. + // Note that we are ignoring multi-preheader loops on the basis that the + // structurizer does not allow that for non-uniform loops. + // There must be a single use in the loop header. + if (!MRI->hasOneUse(Reg)) + return; + auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); + auto Loop = LoopInfo->getLoopFor(UseBlock); + if (!Loop || Loop->getHeader() != UseBlock + || Loop->contains(Defs[0]->getParent())) { + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is multi-def but single use not in loop header\n"); + return; + } + for (unsigned I = 1; I != Defs.size(); ++I) { + if (!Loop->contains(Defs[I]->getParent())) + return; + } + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is a loop phi reg with loop header at " + << "bb." 
<< Loop->getHeader()->getNumber() << "\n"); + LoopPhiDefs.push_back( + std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop)); } -bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { - bool Modified = false; - - // This doesn't actually need LiveIntervals, but we can preserve them. - LIS = getAnalysisIfAvailable<LiveIntervals>(); - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); +// Process a then phi def: It has two defs, one dominates the other, and there +// is a single use in a successor of the dominant def. Here we spot any WWM +// code inside the "then" clause and turn the second def into a partial def so +// its liveness goes through the WWM code in the "then" clause. +bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) { + LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent()); + if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) { + // Ignore if dominating def is undef. + LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n"); + return false; + } + unsigned Reg = DefOpnd->getReg(); + // Get the use block, which is the endif block. + auto UseBlock = MRI->use_instr_begin(Reg)->getParent(); + // Check whether there is WWM code inside the then branch. The WWM code must + // be dominated by the if but not dominated by the endif. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent()) + && !DomTree->dominates(UseBlock, WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; + } + } + if (!ContainsWWM) + return false; + // Get the other def. + MachineInstr *OtherDef = nullptr; + for (auto &MI : MRI->def_instructions(Reg)) { + if (&MI != DefOpnd->getParent()) + OtherDef = &MI; + } + // Make it a partial def. 
+ OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *OtherDef); + return true; +} - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::EXIT_WWM) { - Modified |= runOnWWMInstruction(MI); - } +// Process a loop exit def, that is, a register with a single def in a loop +// that has a use outside the loop. Here we spot any WWM code inside that loop +// and then make the def a partial def so its liveness goes round the loop and +// through the WWM code. +bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd, + MachineLoop *Loop) { + LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent()); + // Check whether there is WWM code inside the loop. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (Loop->contains(WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; } } + if (!ContainsWWM) + return false; + unsigned Reg = DefOpnd->getReg(); + // Add a new implicit_def in loop preheader(s). + for (auto Pred : Loop->getHeader()->predecessors()) { + if (!Loop->contains(Pred)) { + auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), Reg); + LLVM_DEBUG(dbgs() << *ImplicitDef); + (void)ImplicitDef; + } + } + // Make the original def partial. + DefOpnd->getParent()->addOperand(MachineOperand::CreateReg( + Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *DefOpnd->getParent()); + return true; +} - return Modified; +// Process a loop phi def, that is, a multi-def register where the dominating +// def is in the loop pre-header and all other defs are in backedges. Here we +// spot any WWM code inside that loop and then make the backedge defs partial +// defs so the liveness goes through the WWM code. 
+bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd, + MachineLoop *Loop) { + LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent()); + // Check whether there is WWM code inside the loop. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (Loop->contains(WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; + } + } + if (!ContainsWWM) + return false; + unsigned Reg = DefOpnd->getReg(); + // Remove kill mark from uses. + for (auto &Use : MRI->use_operands(Reg)) + Use.setIsKill(false); + // Make all defs except the dominating one partial defs. + SmallVector<MachineInstr *, 4> Defs; + for (auto &Def : MRI->def_instructions(Reg)) + Defs.push_back(&Def); + for (auto Def : Defs) { + if (DefOpnd->getParent() == Def) + continue; + Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *Def); + } + return true; } + Index: llvm/trunk/test/CodeGen/AMDGPU/fix-wwm-liveness.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fix-wwm-liveness.mir +++ llvm/trunk/test/CodeGen/AMDGPU/fix-wwm-liveness.mir @@ -1,8 +1,11 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s -#CHECK: $exec = EXIT_WWM killed %19, implicit %21 + +# Test a then phi value. +#CHECK: test_wwm_liveness_then_phi +#CHECK: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit %21 --- -name: test_wwm_liveness +name: test_wwm_liveness_then_phi alignment: 0 exposesReturnsTwice: false legalized: false @@ -71,3 +74,112 @@ SI_RETURN_TO_EPILOG killed $vgpr0 ... + +# Test a loop with a loop exit value and a loop phi. 
+#CHECK: test_wwm_liveness_loop +#CHECK: %4:vgpr_32 = IMPLICIT_DEF +#CHECK: bb.1: +#CHECK: %4:vgpr_32 = FLAT_LOAD_DWORD{{.*}}, implicit %4 +#CHECK: %27:vgpr_32 = COPY killed %21, implicit %27 + +--- +name: test_wwm_liveness_loop +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: sreg_32_xm0, preferred-register: '' } + - { id: 2, class: sreg_64, preferred-register: '' } + - { id: 3, class: sreg_32_xm0, preferred-register: '' } + - { id: 4, class: vgpr_32, preferred-register: '' } + - { id: 5, class: sreg_32_xm0, preferred-register: '' } + - { id: 6, class: sreg_64, preferred-register: '' } + - { id: 7, class: sreg_64, preferred-register: '' } + - { id: 8, class: sreg_64, preferred-register: '' } + - { id: 9, class: vreg_64, preferred-register: '' } + - { id: 10, class: vgpr_32, preferred-register: '' } + - { id: 11, class: vgpr_32, preferred-register: '' } + - { id: 12, class: vgpr_32, preferred-register: '' } + - { id: 13, class: sreg_64, preferred-register: '' } + - { id: 14, class: vreg_64, preferred-register: '' } + - { id: 15, class: sreg_32_xm0, preferred-register: '' } + - { id: 16, class: vgpr_32, preferred-register: '' } + - { id: 17, class: sreg_64, preferred-register: '$vcc' } + - { id: 18, class: vgpr_32, preferred-register: '' } + - { id: 19, class: vgpr_32, preferred-register: '' } + - { id: 20, class: vgpr_32, preferred-register: '' } + - { id: 21, class: vgpr_32, preferred-register: '' } + - { id: 22, class: vgpr_32, preferred-register: '' } + - { id: 23, class: sreg_64, preferred-register: '' } + - { id: 24, class: sreg_64, preferred-register: '' } + - { id: 25, class: sreg_64, preferred-register: '' } + - { id: 26, class: sreg_64, preferred-register: '' } + - { id: 27, class: vgpr_32, preferred-register: '' } +liveins: +frameInfo: + isFrameAddressTaken: false + 
isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0: + successors: %bb.1(0x80000000) + + %25:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %0:vgpr_32 = FLAT_LOAD_DWORD undef %9:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1) + $exec = EXIT_WWM killed %25 + %12:vgpr_32 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec + %7:sreg_64 = S_MOV_B64 0 + %26:sreg_64 = COPY killed %7 + %27:vgpr_32 = COPY killed %12 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %24:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %20:vgpr_32 = COPY killed %27 + %2:sreg_64 = COPY killed %26 + %4:vgpr_32 = FLAT_LOAD_DWORD undef %14:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1) + $exec = EXIT_WWM killed %24 + %22:vgpr_32 = V_ADD_I32_e32 -1, killed %20, implicit-def dead $vcc, implicit $exec + %17:sreg_64 = V_CMP_EQ_U32_e64 0, %22, implicit $exec + %6:sreg_64 = S_OR_B64 killed %17, killed %2, implicit-def $scc + %21:vgpr_32 = COPY killed %22 + %26:sreg_64 = COPY %6 + %27:vgpr_32 = COPY killed %21 + $exec = S_ANDN2_B64_term $exec, %6 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + $exec = S_OR_B64 $exec, killed %6, implicit-def $scc + %23:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %18:vgpr_32 = V_ADD_F32_e32 killed %0, killed %4, implicit $exec + $exec = EXIT_WWM killed %23 + early-clobber %19:vgpr_32 = COPY killed %18, implicit 
$exec + $vgpr0 = COPY killed %19 + SI_RETURN_TO_EPILOG killed $vgpr0 + +... + Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll @@ -260,8 +260,9 @@ } ; Check that WWM is turned on correctly across basic block boundaries. +; if..then..endif version ; -;CHECK-LABEL: {{^}}test_wwm6: +;CHECK-LABEL: {{^}}test_wwm6_then: ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword @@ -272,7 +273,7 @@ ;VI-CHECK: flat_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG2]] -define amdgpu_ps float @test_wwm6() { +define amdgpu_ps float @test_wwm6_then() { main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -292,6 +293,40 @@ ret float %out.1 } +; Check that WWM is turned on correctly across basic block boundaries. +; loop version +; +;CHECK-LABEL: {{^}}test_wwm6_loop: +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: %loop +;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] +define amdgpu_ps float @test_wwm6_loop() { +main_body: + %src0 = load volatile float, float addrspace(1)* undef + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, float addrspace(1)* undef + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + 
ret float %out.0 +} + ; Check that @llvm.amdgcn.set.inactive disables WWM. ; ;CHECK-LABEL: {{^}}test_set_inactive1: