Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -51,7 +51,7 @@
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
 FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -148,8 +148,8 @@
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;
 
-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;
 
 void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
 extern char &AMDGPUSimplifyLibCallsID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -201,7 +201,7 @@
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
-  initializeSIFixWWMLivenessPass(*PR);
+  initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
@@ -870,9 +870,9 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just before RegisterCoalescing, which runs just after
+  // TwoAddressInstructions.
+  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID, false);
 
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
@@ -887,9 +887,9 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just before RegisterCoalescing, which runs just after
+  // TwoAddressInstructions.
+  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID, false);
 
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -95,7 +95,7 @@
   SIFixSGPRCopies.cpp
   SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
-  SIFixWWMLiveness.cpp
+  SIPreAllocateWWMRegs.cpp
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
Index: lib/Target/AMDGPU/SIFixWWMLiveness.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-///   %vgpr0 = V_MOV_B32_e32 0.0
-///   if (...) {
-///     %vgpr1 = ...
-///     %vgpr2 = WWM killed %vgpr1
-///     ... = killed %vgpr2
-///     %vgpr0 = V_MOV_B32_e32 1.0
-///   }
-///   ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-///    the end of an if..endif. If there is WWM code in the "then", then we
-///    make the def at the end of the "then" branch a partial def by adding an
-///    implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside
-///    the loop, where there is WWM code inside the loop (the case in the
-///    example above). We add an implicit_def of the register in the loop
-///    pre-header, and make the original def a partial def by adding an
-///    implicit use of the register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi
-///    node in a loop header. If there is WWM code inside the loop, then we
-///    make all defs inside the loop partial defs by adding an implicit use of
-///    the register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need
-/// to consider non-uniform control flow, and control flow structurization
-/// would have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before
-/// any coalescing: that any virtual register with more than one definition
-/// must be the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories
-/// is already live at the WWM code before deciding to add the implicit uses
-/// to synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *LoopInfo;
-  LiveIntervals *LIS = nullptr;
-  const SIInstrInfo *TII;
-  const SIRegisterInfo *TRI;
-  MachineRegisterInfo *MRI;
-
-  std::vector<MachineInstr *> WWMs;
-  std::vector<MachineOperand *> ThenDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
-  static char ID;
-
-  SIFixWWMLiveness() : MachineFunctionPass(ID) {
-    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredID(MachineDominatorsID);
-    AU.addRequiredID(MachineLoopInfoID);
-    // Should preserve the same set that TwoAddressInstructions does.
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(LiveVariablesID);
-    AU.addPreservedID(MachineLoopInfoID);
-    AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void processDef(MachineOperand &DefOpnd);
-  bool processThenDef(MachineOperand *DefOpnd);
-  bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-  bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
-                      "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
-                    "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
-  return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
-  bool Modified = false;
-
-  // This doesn't actually need LiveIntervals, but we can preserve them.
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
-  TII = ST.getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-
-  DomTree = &getAnalysis<MachineDominatorTree>();
-  LoopInfo = &getAnalysis<MachineLoopInfo>();
-
-  // Scan the function to find the WWM sections and the candidate registers
-  // for having liveness modified.
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == AMDGPU::EXIT_WWM)
-        WWMs.push_back(&MI);
-      else {
-        for (MachineOperand &DefOpnd : MI.defs()) {
-          if (DefOpnd.isReg()) {
-            unsigned Reg = DefOpnd.getReg();
-            if (TRI->isVGPR(*MRI, Reg))
-              processDef(DefOpnd);
-          }
-        }
-      }
-    }
-  }
-  if (!WWMs.empty()) {
-    // Synthesize liveness over WWM sections as required.
-    for (auto ThenDef : ThenDefs)
-      Modified |= processThenDef(ThenDef);
-    for (auto LoopExitDef : LoopExitDefs)
-      Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
-    for (auto LoopPhiDef : LoopPhiDefs)
-      Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
-  }
-
-  WWMs.clear();
-  ThenDefs.clear();
-  LoopExitDefs.clear();
-  LoopPhiDefs.clear();
-
-  return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
-  unsigned Reg = DefOpnd.getReg();
-  // Get all the defining instructions. For convenience, make Defs[0] the def
-  // we are on now.
-  SmallVector<MachineInstr *, 4> Defs;
-  Defs.push_back(DefOpnd.getParent());
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd.getParent())
-      Defs.push_back(&MI);
-  }
-  // Check whether this def dominates all the others. If not, ignore this def.
-  // Either it is going to be processed when the scan encounters its other def
-  // that dominates all defs, or there is no def that dominates all others.
-  // The latter case is an eliminated phi from an if..else..endif or similar,
-  // which must be for uniform control flow so can be ignored.
-  // Because this pass runs shortly after PHIElimination, we assume that any
-  // multi-def register is a lowered phi, and thus has each def in a separate
-  // basic block.
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
-      return;
-  }
-  // Check for the case of an if..endif lowered phi: It has two defs, one
-  // dominates the other, and there is a single use in a successor of the
-  // dominant def. Later we will spot any WWM code inside the "then" clause
-  // and turn the second def into a partial def so its liveness goes through
-  // the WWM code in the "then" clause.
-  if (Defs.size() == 2) {
-    auto DomDefBlock = Defs[0]->getParent();
-    if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
-      auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-      for (auto Succ : DomDefBlock->successors()) {
-        if (Succ == UseBlock) {
-          LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
-          ThenDefs.push_back(&DefOpnd);
-          return;
-        }
-      }
-    }
-  }
-  // Check for the case of a non-lowered-phi register (single def) that exits
-  // a loop, that is, it has a use that is outside a loop that the def is
-  // inside. We find the outermost loop that the def is inside but a use is
-  // outside. Later we will spot any WWM code inside that loop and then make
-  // the def a partial def so its liveness goes round the loop and through the
-  // WWM code.
-  if (Defs.size() == 1) {
-    auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
-    if (!Loop)
-      return;
-    bool IsLoopExit = false;
-    for (auto &Use : MRI->use_instructions(Reg)) {
-      auto UseBlock = Use.getParent();
-      if (Loop->contains(UseBlock))
-        continue;
-      IsLoopExit = true;
-      while (auto Parent = Loop->getParentLoop()) {
-        if (Parent->contains(UseBlock))
-          break;
-        Loop = Parent;
-      }
-    }
-    if (!IsLoopExit)
-      return;
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                      << " is a loop exit reg with loop header at "
-                      << "bb." << Loop->getHeader()->getNumber() << "\n");
-    LoopExitDefs.push_back(
-        std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-    return;
-  }
-  // Check for the case of a lowered single-preheader-loop phi, that is, a
-  // multi-def register where the dominating def is in the loop pre-header and
-  // all other defs are in backedges. Later we will spot any WWM code inside
-  // that loop and then make the backedge defs partial defs so the liveness
-  // goes through the WWM code.
-  // Note that we are ignoring multi-preheader loops on the basis that the
-  // structurizer does not allow that for non-uniform loops.
-  // There must be a single use in the loop header.
-  if (!MRI->hasOneUse(Reg))
-    return;
-  auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-  auto Loop = LoopInfo->getLoopFor(UseBlock);
-  if (!Loop || Loop->getHeader() != UseBlock
-      || Loop->contains(Defs[0]->getParent())) {
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                      << " is multi-def but single use not in loop header\n");
-    return;
-  }
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!Loop->contains(Defs[I]->getParent()))
-      return;
-  }
-  LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                    << " is a loop phi reg with loop header at "
-                    << "bb." << Loop->getHeader()->getNumber() << "\n");
-  LoopPhiDefs.push_back(
-      std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
-  LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
-  if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-    // Ignore if dominating def is undef.
-    LLVM_DEBUG(dbgs() << "  ignoring as dominating def is undef\n");
-    return false;
-  }
-  unsigned Reg = DefOpnd->getReg();
-  // Get the use block, which is the endif block.
-  auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
-  // Check whether there is WWM code inside the then branch. The WWM code must
-  // be dominated by the if but not dominated by the endif.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
-        && !DomTree->dominates(UseBlock, WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  // Get the other def.
-  MachineInstr *OtherDef = nullptr;
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd->getParent())
-      OtherDef = &MI;
-  }
-  // Make it a partial def.
-  OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *OtherDef);
-  return true;
-}
-
-// Process a loop exit def, that is, a register with a single def inside a
-// loop that has a use outside the loop. Here we spot any WWM code inside that
-// loop and then make the def a partial def so its liveness goes round the
-// loop and through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
-                                          MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Add a new implicit_def in loop preheader(s).
-  for (auto Pred : Loop->getHeader()->predecessors()) {
-    if (!Loop->contains(Pred)) {
-      auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
-                                 TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
-      LLVM_DEBUG(dbgs() << *ImplicitDef);
-      (void)ImplicitDef;
-    }
-  }
-  // Make the original def partial.
-  DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
-      Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
-  return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
-                                         MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Remove kill mark from uses.
-  for (auto &Use : MRI->use_operands(Reg))
-    Use.setIsKill(false);
-  // Make all defs except the dominating one partial defs.
-  SmallVector<MachineInstr *, 4> Defs;
-  for (auto &Def : MRI->def_instructions(Reg))
-    Defs.push_back(&Def);
-  for (auto Def : Defs) {
-    if (DefOpnd->getParent() == Def)
-      continue;
-    Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-    LLVM_DEBUG(dbgs() << *Def);
-  }
-  return true;
-}
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1286,9 +1286,15 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::ENTER_WWM: {
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell
+    // when WWM is entered.
+    MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64));
+    break;
+  }
   case AMDGPU::EXIT_WWM: {
-    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
-    // is exited.
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell
+    // when WWM is exited.
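+    // In the lowered form (as built by SIWholeQuadMode and exercised by the
+    // MIR tests below) this turns '$exec = EXIT_WWM %saved' into
+    // '$exec = S_MOV_B64 %saved', restoring the exec mask that the matching
+    // ENTER_WWM saved.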
     MI.setDesc(get(AMDGPU::S_MOV_B64));
     break;
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -121,6 +121,13 @@
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
+  let Defs = [EXEC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
   let hasSideEffects = 0;
   let mayLoad = 0;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -15,13 +15,14 @@
 #include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPUMachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -206,6 +207,10 @@
     SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
   };
 
+  SparseBitVector<> WWMReservedRegs;
+
+  void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
 private:
   // SGPR->VGPR spilling support.
   using SpillRegMask = std::pair<unsigned, unsigned>;
Index: lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -0,0 +1,253 @@
+//===-- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
+  LiveRegMatrix *Matrix;
+  VirtRegMap *VRM;
+  RegisterClassInfo RegClassInfo;
+
+  std::vector<unsigned> RegsToRewrite;
+
+public:
+  static char ID;
+
+  SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+    initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Pre-allocate WWM Registers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<VirtRegMap>();
+    AU.addRequired<LiveRegMatrix>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool processDef(MachineOperand &MO);
+  void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                      "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                    "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+  return new SIPreAllocateWWMRegs();
+}
+
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+
+  unsigned Reg = MO.getReg();
+
+  if (!TRI->isVGPR(*MRI, Reg))
+    return false;
+
+  if (TRI->isPhysicalRegister(Reg))
+    return false;
+
+  if (VRM->hasPhys(Reg))
+    return false;
+
+  LiveInterval &LI = LIS->getInterval(Reg);
+
+  for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+    if (!MRI->isPhysRegUsed(PhysReg) &&
+        Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
+      Matrix->assign(LI, PhysReg);
+      assert(PhysReg != 0);
+      RegsToRewrite.push_back(Reg);
+      return true;
+    }
+  }
+
+  llvm_unreachable("physreg not found for WWM expression");
+  return false;
+}
+
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+
+        const unsigned VirtReg = MO.getReg();
+        if (TRI->isPhysicalRegister(VirtReg))
+          continue;
+
+        if (!VRM->hasPhys(VirtReg))
+          continue;
+
+        unsigned PhysReg = VRM->getPhys(VirtReg);
+        const unsigned SubReg = MO.getSubReg();
+        if (SubReg != 0) {
+          PhysReg = TRI->getSubReg(PhysReg, SubReg);
+          MO.setSubReg(0);
+        }
+
+        MO.setReg(PhysReg);
+        MO.setIsRenamable(false);
+      }
+    }
+  }
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (unsigned Reg : RegsToRewrite) {
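+    // All uses of this virtual register were rewritten to its physical
+    // assignment in the loop above, so the stale live interval can be
+    // dropped.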
+    LIS->removeInterval(Reg);
+
+    const unsigned PhysReg = VRM->getPhys(Reg);
+    assert(PhysReg != 0);
+    MFI->ReserveWWMRegister(PhysReg);
+
+    // Need to turn any COPYs into MOVs when the source register is one of
+    // our physical registers.
+    const unsigned MovOp = TII->getMovOpcode(TRI->getPhysRegClass(PhysReg));
+
+    for (MachineOperand &MO : MRI->reg_operands(PhysReg)) {
+      MachineInstr &MI = *MO.getParent();
+
+      // Only check copies.
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      assert(MI.getNumOperands() >= 2);
+
+      MachineOperand &Dst = MI.getOperand(0);
+
+      // If the destination is a physical register, skip.
+      if (!Dst.isReg() || TRI->isPhysicalRegister(Dst.getReg()))
+        continue;
+
+      MachineOperand &Src = MI.getOperand(1);
+
+      // If the source wasn't our physical register, skip.
+      if (!Src.isReg() || (PhysReg != Src.getReg()))
+        continue;
+
+      // Change the MI into a mov.
+      MI.setDesc(TII->get(MovOp));
+
+      // And make it implicitly depend on exec (like all VALU movs should do).
+      MI.addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+    }
+  }
+
+  RegsToRewrite.clear();
+
+  // Update the set of reserved registers to include WWM ones.
+  MRI->freezeReservedRegs(MF);
+}
+
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName()
+                    << "\n");
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+  Matrix = &getAnalysis<LiveRegMatrix>();
+  VRM = &getAnalysis<VirtRegMap>();
+
+  RegClassInfo.runOnMachineFunction(MF);
+
+  bool RegsAssigned = false;
+
+  // We use a reverse post-order traversal of the control-flow graph to
+  // guarantee that we visit definitions in dominance order. Since WWM
+  // expressions are guaranteed to never involve phi nodes, and we can only
+  // escape WWM through the special WWM instruction, this means that this is a
+  // perfect elimination order, so we can never do any better.
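+  //
+  // Note that V_SET_INACTIVE_B32 defs are processed below even when they do
+  // not sit between ENTER_WWM and EXIT_WWM, since set.inactive writes the
+  // inactive lanes of its result as well.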
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+
+  for (MachineBasicBlock *MBB : RPOT) {
+    bool InWWM = false;
+    for (MachineInstr &MI : *MBB) {
+      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
+        RegsAssigned |= processDef(MI.getOperand(0));
+
+      if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
+        LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+        InWWM = true;
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+        LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+        InWWM = false;
+      }
+
+      if (!InWWM)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+      for (MachineOperand &DefOpnd : MI.defs()) {
+        RegsAssigned |= processDef(DefOpnd);
+      }
+    }
+  }
+
+  if (!RegsAssigned)
+    return false;
+
+  rewriteRegs(MF);
+  return true;
+}
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -227,6 +227,10 @@
     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
   }
 
+  for (unsigned Reg : MFI->WWMReservedRegs) {
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
   return Reserved;
 }
 
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -654,8 +654,7 @@
   MachineInstr *MI;
 
   assert(SaveOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
-               SaveOrig)
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
            .addImm(-1);
   LIS->InsertMachineInstrInMaps(*MI);
 }
Index: test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -112,7 +112,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
@@ -120,8 +120,7 @@
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -133,9 +133,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -136,9 +136,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -104,9 +104,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -117,9 +117,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/fix-wwm-liveness.mir
===================================================================
--- test/CodeGen/AMDGPU/fix-wwm-liveness.mir
+++ /dev/null
@@ -1,185 +0,0 @@
-# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
-
-# Test a then phi value.
-#CHECK: test_wwm_liveness_then_phi
-#CHECK: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit %21
-
----
-name: test_wwm_liveness_then_phi
-alignment: 0
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sreg_64, preferred-register: '' }
-  - { id: 1, class: sgpr_32, preferred-register: '' }
-  - { id: 2, class: sgpr_32, preferred-register: '' }
-  - { id: 3, class: vgpr_32, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: vgpr_32, preferred-register: '' }
-  - { id: 6, class: vgpr_32, preferred-register: '' }
-  - { id: 7, class: vgpr_32, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 9, class: sreg_64, preferred-register: '' }
-  - { id: 10, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 11, class: sreg_64, preferred-register: '' }
-  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 15, class: sreg_128, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: vgpr_32, preferred-register: '' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: sreg_64, preferred-register: '' }
-  - { id: 20, class: sreg_64, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: sreg_64, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-liveins:
-body: |
-  bb.0:
-    successors: %bb.1(0x40000000), %bb.2(0x40000000)
-
-    %21 = V_MOV_B32_e32 0, implicit $exec
-    %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit $exec
-    %8 = V_CMP_GT_U32_e64 32, killed %6, implicit $exec
-    %22 = COPY $exec, implicit-def $exec
-    %23 = S_AND_B64 %22, %8, implicit-def dead $scc
-    %0 = S_XOR_B64 %23, %22, implicit-def dead $scc
-    $exec = S_MOV_B64_term killed %23
-    SI_MASK_BRANCH %bb.2, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.2(0x80000000)
-
-    %13 = S_MOV_B32 61440
-    %14 = S_MOV_B32 -1
-    %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
-    %19 = COPY $exec
-    $exec = S_MOV_B64 -1
-    %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4)
-    %17 = V_ADD_F32_e32 1065353216, killed %16, implicit $exec
-    $exec = EXIT_WWM killed %19
-    %21 = V_MOV_B32_e32 1, implicit $exec
-    early-clobber %18 = WWM killed %17, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit $exec :: (store 4)
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %0, implicit-def $scc
-    $vgpr0 = COPY killed %21
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
-
-# Test a loop with a loop exit value and a loop phi.
-#CHECK: test_wwm_liveness_loop
-#CHECK: %4:vgpr_32 = IMPLICIT_DEF
-#CHECK: bb.1:
-#CHECK: %4:vgpr_32 = FLAT_LOAD_DWORD{{.*}}, implicit %4
-#CHECK: %27:vgpr_32 = COPY killed %21, implicit %27
-
----
-name: test_wwm_liveness_loop
-alignment: 0
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '' }
-  - { id: 1, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 2, class: sreg_64, preferred-register: '' }
-  - { id: 3, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 6, class: sreg_64, preferred-register: '' }
-  - { id: 7, class: sreg_64, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '' }
-  - { id: 9, class: vreg_64, preferred-register: '' }
-  - { id: 10, class: vgpr_32, preferred-register: '' }
-  - { id: 11, class: vgpr_32, preferred-register: '' }
-  - { id: 12, class: vgpr_32, preferred-register: '' }
-  - { id: 13, class: sreg_64, preferred-register: '' }
-  - { id: 14, class: vreg_64, preferred-register: '' }
-  - { id: 15, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: vgpr_32, preferred-register: '' }
-  - { id: 20, class: vgpr_32, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: vgpr_32, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-  - { id: 24, class: sreg_64, preferred-register: '' }
-  - { id: 25, class: sreg_64, preferred-register: '' }
-  - { id: 26, class: sreg_64, preferred-register: '' }
-  - { id: 27, class: vgpr_32, preferred-register: '' }
-liveins:
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap: false
-  hasPatchPoint: false
-  stackSize: 0
-  offsetAdjustment: 0
-  maxAlignment: 0
-  adjustsStack: false
-  hasCalls: false
-  stackProtector: ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart: false
-  hasMustTailInVarArgFunc: false
-  localFrameSize: 0
-  savePoint: ''
-  restorePoint: ''
-fixedStack:
-stack:
-constants:
-body: |
-  bb.0:
-    successors: %bb.1(0x80000000)
-
-    %25:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %0:vgpr_32 = FLAT_LOAD_DWORD undef %9:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %25
-    %12:vgpr_32 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %7:sreg_64 = S_MOV_B64 0
-    %26:sreg_64 = COPY killed %7
-    %27:vgpr_32 = COPY killed %12
-
-  bb.1:
-    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-
-    %24:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %20:vgpr_32 = COPY killed %27
-    %2:sreg_64 = COPY killed %26
-    %4:vgpr_32 = FLAT_LOAD_DWORD undef %14:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %24
-    %22:vgpr_32 = V_ADD_I32_e32 -1, killed %20, implicit-def dead $vcc, implicit $exec
-    %17:sreg_64 = V_CMP_EQ_U32_e64 0, %22, implicit $exec
-    %6:sreg_64 = S_OR_B64 killed %17, killed %2, implicit-def $scc
-    %21:vgpr_32 = COPY killed %22
-    %26:sreg_64 = COPY %6
-    %27:vgpr_32 = COPY killed %21
-    $exec = S_ANDN2_B64_term $exec, %6
-    S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %6, implicit-def $scc
-    %23:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %18:vgpr_32 = V_ADD_F32_e32 killed %0, killed %4, implicit $exec
-    $exec = EXIT_WWM killed %23
-    early-clobber %19:vgpr_32 = COPY killed %18, implicit $exec
-    $vgpr0 = COPY killed %19
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -81,28 +81,25 @@
 ; GCN: bb.1:
 ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
 ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 8 from %stack.5, align 4, addrspace 5)
-; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
-; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
+; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+; GCN: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+; GCN: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0
-; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
-; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
+; GCN: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
+; GCN: renamable $vgpr17 = V_MOV_B32_e32 undef $vgpr2, implicit $exec, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16, implicit $m0
 ; GCN: S_SET_GPR_IDX_OFF
-; GCN: renamable $vgpr19 = COPY renamable $vgpr18
-; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
-; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.5, align 4, addrspace 5)
-; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.6, align 4, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
-; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
+; GCN: renamable $vgpr18 = COPY renamable $vgpr17
+; GCN: renamable $sgpr4_sgpr5 = COPY renamable $sgpr0_sgpr1
+; GCN: SI_SPILL_S64_SAVE killed $sgpr4_sgpr5, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.5, align 4, addrspace 5)
+; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+; GCN: SI_SPILL_V32_SAVE killed $vgpr17, %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.6, addrspace 5)
+; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
 ; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 ; GCN: bb.2:
 ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 8 from %stack.3, align 4, addrspace 5)
 ; GCN: $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1
-; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
+; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5)
 ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 16 from %stack.1, align 4, addrspace 5)
 ; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
 ; GCN: S_ENDPGM
Index: test/CodeGen/AMDGPU/wqm.mir
===================================================================
--- test/CodeGen/AMDGPU/wqm.mir
+++ test/CodeGen/AMDGPU/wqm.mir
@@ -3,7 +3,7 @@
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
-#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: ENTER_WWM
 #CHECK: S_CMP_LT_I32
 #CHECK: S_CSELECT_B32
 name: test_wwm_scc
Index: test/CodeGen/AMDGPU/wwm-reserved.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -0,0 +1,166 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,GFX9-O0 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,GFX9-O3 %s
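+
+; Check that values produced by llvm.amdgcn.set.inactive/llvm.amdgcn.wwm
+; chains survive in their pre-allocated, reserved registers, with and without
+; control flow, across a call, and at -O0.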
+
+define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) {
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+  %tmp108 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 273, i32 15, i32 15, i1 false)
+  %tmp109 = add i32 %tmp108, %tmp105
+  %tmp110 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 274, i32 15, i32 15, i1 false)
+  %tmp111 = add i32 %tmp109, %tmp110
+  %tmp112 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 275, i32 15, i32 15, i1 false)
+  %tmp113 = add i32 %tmp111, %tmp112
+  %tmp114 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp113, i32 276, i32 15, i32 14, i1 false)
+  %tmp115 = add i32 %tmp113, %tmp114
+  %tmp116 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp115, i32 280, i32 15, i32 12, i1 false)
+  %tmp117 = add i32 %tmp115, %tmp116
+  %tmp118 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp117, i32 322, i32 10, i32 15, i1 false)
+  %tmp119 = add i32 %tmp117, %tmp118
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp119, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp119, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+
+  %tmp123 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 273, i32 15, i32 15, i1 false)
+  %tmp124 = add i32 %tmp123, %tmp107
+  %tmp125 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 274, i32 15, i32 15, i1 false)
+  %tmp126 = add i32 %tmp124, %tmp125
+  %tmp127 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 275, i32 15, i32 15, i1 false)
+  %tmp128 = add i32 %tmp126, %tmp127
+  %tmp129 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp128, i32 276, i32 15, i32 14, i1 false)
+  %tmp130 = add i32 %tmp128, %tmp129
+  %tmp131 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp130, i32 280, i32 15, i32 12, i1 false)
+  %tmp132 = add i32 %tmp130, %tmp131
+  %tmp133 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp132, i32 322, i32 10, i32 15, i1 false)
+  %tmp134 = add i32 %tmp132, %tmp133
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp134, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp134, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %tmp137
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_cs void @cfg(<4 x i32> inreg %tmp14, i32 %arg) {
+entry:
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+  %tmp108 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 273, i32 15, i32 15, i1 false)
+  %tmp109 = add i32 %tmp108, %tmp105
+  %tmp110 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 274, i32 15, i32 15, i1 false)
+  %tmp111 = add i32 %tmp109, %tmp110
+  %tmp112 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 275, i32 15, i32 15, i1 false)
+  %tmp113 = add i32 %tmp111, %tmp112
+  %tmp114 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp113, i32 276, i32 15, i32 14, i1 false)
+  %tmp115 = add i32 %tmp113, %tmp114
+  %tmp116 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp115, i32 280, i32 15, i32 12, i1 false)
+  %tmp117 = add i32 %tmp115, %tmp116
+  %tmp118 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp117, i32 322, i32 10, i32 15, i1 false)
+  %tmp119 = add i32 %tmp117, %tmp118
+
+
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]]
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp119, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp119, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if, label %merge
+if:
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+  %tmp123 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 273, i32 15, i32 15, i1 false)
+  %tmp124 = add i32 %tmp123, %tmp107
+  %tmp125 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 274, i32 15, i32 15, i1 false)
+  %tmp126 = add i32 %tmp124, %tmp125
+  %tmp127 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 275, i32 15, i32 15, i1 false)
+  %tmp128 = add i32 %tmp126, %tmp127
+  %tmp129 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp128, i32 276, i32 15, i32 14, i1 false)
+  %tmp130 = add i32 %tmp128, %tmp129
+  %tmp131 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp130, i32 280, i32 15, i32 12, i1 false)
+  %tmp132 = add i32 %tmp130, %tmp131
+  %tmp133 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp132, i32 322, i32 10, i32 15, i1 false)
+  %tmp134 = add i32 %tmp132, %tmp133
+
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]]
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp134, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp134, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+  br label %merge
+
+merge:
+  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]]
+; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %merge_value
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+define i32 @called(i32 %a) noinline {
+; GFX9: v_add_u32_e32 v1, v0, v0
+  %add = add i32 %a, %a
+; GFX9: v_mul_lo_i32 v0, v1, v0
+  %mul = mul i32 %add, %a
+; GFX9: v_sub_u32_e32 v0, v0, v1
+  %sub = sub i32 %mul, %add
+  ret i32 %sub
+}
+
+define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
+; GFX9-O0: v_mov_b32_e32 v2, v0
+; GFX9-O3: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_not_b64 exec, exec
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
+; GFX9: v_mov_b32_e32 v0, v2
+; GFX9: s_waitcnt lgkmcnt(0)
+; GFX9: s_swappc_b64
+  %tmp134 = call i32 @called(i32 %tmp107)
+; GFX9: v_mov_b32_e32 v1, v0
+; GFX9: v_add_u32_e32 v1, v1, v2
+  %tmp136 = add i32 %tmp134, %tmp107
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+; GFX9: buffer_store_dword v0
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wwm.i32(i32)
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32)