Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -51,7 +51,7 @@
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
 FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -148,8 +148,8 @@
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;
 
-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;
 
 void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
 extern char &AMDGPUSimplifyLibCallsID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -201,7 +201,7 @@
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
-  initializeSIFixWWMLivenessPass(*PR);
+  initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
@@ -870,9 +870,9 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just before RegisterCoalescing, which runs just after
+  // TwoAddressInstructions.
+  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID, false);
 
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
@@ -887,9 +887,9 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just before RegisterCoalescing, which runs just after
+  // TwoAddressInstructions.
+  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID, false);
 
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -95,7 +95,7 @@
   SIFixSGPRCopies.cpp
   SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
-  SIFixWWMLiveness.cpp
+  SIPreAllocateWWMRegs.cpp
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
Index: lib/Target/AMDGPU/SIFixWWMLiveness.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-///   %vgpr0 = V_MOV_B32_e32 0.0
-///   if (...) {
-///     %vgpr1 = ...
-///     %vgpr2 = WWM killed %vgpr1
-///     ... = killed %vgpr2
-///     %vgpr0 = V_MOV_B32_e32 1.0
-///   }
-///   ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-///    the end of an if..endif. If there is WWM code in the "then", then we
-///    make the def at the end of the "then" branch a partial def by adding an
-///    implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside
-///    the loop, where there is WWM code inside the loop (the case in the
-///    example above). We add an implicit_def of the register in the loop
-///    pre-header, and make the original def a partial def by adding an
-///    implicit use of the register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi
-///    node in a loop header. If there is WWM code inside the loop, then we
-///    make all defs inside the loop partial defs by adding an implicit use of
-///    the register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need
-/// to consider non-uniform control flow, and control flow structurization
-/// would have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before
-/// any coalescing: that any virtual register with more than one definition
-/// must be the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories
-/// is already live at the WWM code before deciding to add the implicit uses
-/// to synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *LoopInfo;
-  LiveIntervals *LIS = nullptr;
-  const SIInstrInfo *TII;
-  const SIRegisterInfo *TRI;
-  MachineRegisterInfo *MRI;
-
-  std::vector<MachineInstr *> WWMs;
-  std::vector<MachineOperand *> ThenDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
-  static char ID;
-
-  SIFixWWMLiveness() : MachineFunctionPass(ID) {
-    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredID(MachineDominatorsID);
-    AU.addRequiredID(MachineLoopInfoID);
-    // Should preserve the same set that TwoAddressInstructions does.
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(LiveVariablesID);
-    AU.addPreservedID(MachineLoopInfoID);
-    AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void processDef(MachineOperand &DefOpnd);
-  bool processThenDef(MachineOperand *DefOpnd);
-  bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-  bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
-                      "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
-                    "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
-  return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
-  bool Modified = false;
-
-  // This doesn't actually need LiveIntervals, but we can preserve them.
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
-  TII = ST.getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-
-  DomTree = &getAnalysis<MachineDominatorTree>();
-  LoopInfo = &getAnalysis<MachineLoopInfo>();
-
-  // Scan the function to find the WWM sections and the candidate registers
-  // for having liveness modified.
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == AMDGPU::EXIT_WWM)
-        WWMs.push_back(&MI);
-      else {
-        for (MachineOperand &DefOpnd : MI.defs()) {
-          if (DefOpnd.isReg()) {
-            unsigned Reg = DefOpnd.getReg();
-            if (TRI->isVGPR(*MRI, Reg))
-              processDef(DefOpnd);
-          }
-        }
-      }
-    }
-  }
-  if (!WWMs.empty()) {
-    // Synthesize liveness over WWM sections as required.
-    for (auto ThenDef : ThenDefs)
-      Modified |= processThenDef(ThenDef);
-    for (auto LoopExitDef : LoopExitDefs)
-      Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
-    for (auto LoopPhiDef : LoopPhiDefs)
-      Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
-  }
-
-  WWMs.clear();
-  ThenDefs.clear();
-  LoopExitDefs.clear();
-  LoopPhiDefs.clear();
-
-  return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
-  unsigned Reg = DefOpnd.getReg();
-  // Get all the defining instructions. For convenience, make Defs[0] the def
-  // we are on now.
-  SmallVector<MachineInstr *, 4> Defs;
-  Defs.push_back(DefOpnd.getParent());
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd.getParent())
-      Defs.push_back(&MI);
-  }
-  // Check whether this def dominates all the others. If not, ignore this def.
-  // Either it is going to be processed when the scan encounters its other def
-  // that dominates all defs, or there is no def that dominates all others.
-  // The latter case is an eliminated phi from an if..else..endif or similar,
-  // which must be for uniform control flow so can be ignored.
-  // Because this pass runs shortly after PHIElimination, we assume that any
-  // multi-def register is a lowered phi, and thus has each def in a separate
-  // basic block.
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
-      return;
-  }
-  // Check for the case of an if..endif lowered phi: It has two defs, one
-  // dominates the other, and there is a single use in a successor of the
-  // dominant def. Later we will spot any WWM code inside the "then" clause
-  // and turn the second def into a partial def so its liveness goes through
-  // the WWM code in the "then" clause.
-  if (Defs.size() == 2) {
-    auto DomDefBlock = Defs[0]->getParent();
-    if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
-      auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-      for (auto Succ : DomDefBlock->successors()) {
-        if (Succ == UseBlock) {
-          LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
-          ThenDefs.push_back(&DefOpnd);
-          return;
-        }
-      }
-    }
-  }
-  // Check for the case of a non-lowered-phi register (single def) that exits
-  // a loop, that is, it has a use that is outside a loop that the def is
-  // inside. We find the outermost loop that the def is inside but a use is
-  // outside. Later we will spot any WWM code inside that loop and then make
-  // the def a partial def so its liveness goes round the loop and through the
-  // WWM code.
-  if (Defs.size() == 1) {
-    auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
-    if (!Loop)
-      return;
-    bool IsLoopExit = false;
-    for (auto &Use : MRI->use_instructions(Reg)) {
-      auto UseBlock = Use.getParent();
-      if (Loop->contains(UseBlock))
-        continue;
-      IsLoopExit = true;
-      while (auto Parent = Loop->getParentLoop()) {
-        if (Parent->contains(UseBlock))
-          break;
-        Loop = Parent;
-      }
-    }
-    if (!IsLoopExit)
-      return;
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                      << " is a loop exit reg with loop header at "
-                      << "bb." << Loop->getHeader()->getNumber() << "\n");
-    LoopExitDefs.push_back(
-        std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-    return;
-  }
-  // Check for the case of a lowered single-preheader-loop phi, that is, a
-  // multi-def register where the dominating def is in the loop pre-header and
-  // all other defs are in backedges. Later we will spot any WWM code inside
-  // that loop and then make the backedge defs partial defs so the liveness
-  // goes through the WWM code.
-  // Note that we are ignoring multi-preheader loops on the basis that the
-  // structurizer does not allow that for non-uniform loops.
-  // There must be a single use in the loop header.
-  if (!MRI->hasOneUse(Reg))
-    return;
-  auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-  auto Loop = LoopInfo->getLoopFor(UseBlock);
-  if (!Loop || Loop->getHeader() != UseBlock
-      || Loop->contains(Defs[0]->getParent())) {
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                      << " is multi-def but single use not in loop header\n");
-    return;
-  }
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!Loop->contains(Defs[I]->getParent()))
-      return;
-  }
-  LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-                    << " is a loop phi reg with loop header at "
-                    << "bb." << Loop->getHeader()->getNumber() << "\n");
-  LoopPhiDefs.push_back(
-      std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
-  LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
-  if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-    // Ignore if dominating def is undef.
-    LLVM_DEBUG(dbgs() << "  ignoring as dominating def is undef\n");
-    return false;
-  }
-  unsigned Reg = DefOpnd->getReg();
-  // Get the use block, which is the endif block.
-  auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
-  // Check whether there is WWM code inside the then branch. The WWM code must
-  // be dominated by the if but not dominated by the endif.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
-        && !DomTree->dominates(UseBlock, WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  // Get the other def.
-  MachineInstr *OtherDef = nullptr;
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd->getParent())
-      OtherDef = &MI;
-  }
-  // Make it a partial def.
-  OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *OtherDef);
-  return true;
-}
-
-// Process a loop exit def, that is, a register with a single def inside a
-// loop that has a use outside the loop. Here we spot any WWM code inside that
-// loop and then make the def a partial def so its liveness goes round the
-// loop and through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
-                                          MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Add a new implicit_def in loop preheader(s).
-  for (auto Pred : Loop->getHeader()->predecessors()) {
-    if (!Loop->contains(Pred)) {
-      auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
-                                 TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
-      LLVM_DEBUG(dbgs() << *ImplicitDef);
-      (void)ImplicitDef;
-    }
-  }
-  // Make the original def partial.
-  DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
-      Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
-  return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
-                                         MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Remove kill mark from uses.
-  for (auto &Use : MRI->use_operands(Reg))
-    Use.setIsKill(false);
-  // Make all defs except the dominating one partial defs.
-  SmallVector<MachineInstr *, 4> Defs;
-  for (auto &Def : MRI->def_instructions(Reg))
-    Defs.push_back(&Def);
-  for (auto Def : Defs) {
-    if (DefOpnd->getParent() == Def)
-      continue;
-    Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-    LLVM_DEBUG(dbgs() << *Def);
-  }
-  return true;
-}
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1286,9 +1286,15 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::ENTER_WWM: {
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell
+    // when WWM is entered.
+    MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64));
+    break;
+  }
   case AMDGPU::EXIT_WWM: {
-    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
-    // is exited.
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell
+    // when WWM is exited.
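+    // In the lowered form (as built by SIWholeQuadMode and exercised by the
+    // MIR tests below) this turns '$exec = EXIT_WWM %saved' into
+    // '$exec = S_MOV_B64 %saved', restoring the exec mask that the matching
+    // ENTER_WWM saved.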
     MI.setDesc(get(AMDGPU::S_MOV_B64));
     break;
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -121,6 +121,13 @@
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
+  let Defs = [EXEC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
   let hasSideEffects = 0;
   let mayLoad = 0;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -15,13 +15,14 @@
 #include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPUMachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -206,6 +207,10 @@
     SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
   };
 
+  SparseBitVector<> WWMReservedRegs;
+
+  void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
 private:
   // SGPR->VGPR spilling support.
   using SpillRegMask = std::pair<unsigned, unsigned>;
Index: lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -0,0 +1,253 @@
+//===-- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
+  LiveRegMatrix *Matrix;
+  VirtRegMap *VRM;
+  RegisterClassInfo RegClassInfo;
+
+  std::vector<unsigned> RegsToRewrite;
+
+public:
+  static char ID;
+
+  SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+    initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Pre-allocate WWM Registers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<VirtRegMap>();
+    AU.addRequired<LiveRegMatrix>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool processDef(MachineOperand &MO);
+  void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                      "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                    "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+  return new SIPreAllocateWWMRegs();
+}
+
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+
+  unsigned Reg = MO.getReg();
+
+  if (!TRI->isVGPR(*MRI, Reg))
+    return false;
+
+  if (TRI->isPhysicalRegister(Reg))
+    return false;
+
+  if (VRM->hasPhys(Reg))
+    return false;
+
+  LiveInterval &LI = LIS->getInterval(Reg);
+
+  for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+    if (!MRI->isPhysRegUsed(PhysReg) &&
+        Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
+      Matrix->assign(LI, PhysReg);
+      assert(PhysReg != 0);
+      RegsToRewrite.push_back(Reg);
+      return true;
+    }
+  }
+
+  llvm_unreachable("physreg not found for WWM expression");
+  return false;
+}
+
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+
+        const unsigned VirtReg = MO.getReg();
+        if (TRI->isPhysicalRegister(VirtReg))
+          continue;
+
+        if (!VRM->hasPhys(VirtReg))
+          continue;
+
+        unsigned PhysReg = VRM->getPhys(VirtReg);
+        const unsigned SubReg = MO.getSubReg();
+        if (SubReg != 0) {
+          PhysReg = TRI->getSubReg(PhysReg, SubReg);
+          MO.setSubReg(0);
+        }
+
+        MO.setReg(PhysReg);
+        MO.setIsRenamable(false);
+      }
+    }
+  }
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (unsigned Reg : RegsToRewrite) {
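+    // All uses of this virtual register were rewritten to its physical
+    // assignment in the loop above, so the stale live interval can be
+    // dropped.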
+    LIS->removeInterval(Reg);
+
+    const unsigned PhysReg = VRM->getPhys(Reg);
+    assert(PhysReg != 0);
+    MFI->ReserveWWMRegister(PhysReg);
+
+    // Need to turn any COPYs into MOVs when the source register is one of
+    // our physical registers.
+    const unsigned MovOp = TII->getMovOpcode(TRI->getPhysRegClass(PhysReg));
+
+    for (MachineOperand &MO : MRI->reg_operands(PhysReg)) {
+      MachineInstr &MI = *MO.getParent();
+
+      // Only check copies.
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      assert(MI.getNumOperands() >= 2);
+
+      MachineOperand &Dst = MI.getOperand(0);
+
+      // If the destination is a physical register, skip.
+      if (!Dst.isReg() || TRI->isPhysicalRegister(Dst.getReg()))
+        continue;
+
+      MachineOperand &Src = MI.getOperand(1);
+
+      // If the source wasn't our physical register, skip.
+      if (!Src.isReg() || (PhysReg != Src.getReg()))
+        continue;
+
+      // Change the MI into a mov.
+      MI.setDesc(TII->get(MovOp));
+
+      // And make it implicitly depend on exec (like all VALU movs should do).
+      MI.addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+    }
+  }
+
+  RegsToRewrite.clear();
+
+  // Update the set of reserved registers to include WWM ones.
+  MRI->freezeReservedRegs(MF);
+}
+
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName()
+                    << "\n");
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+  Matrix = &getAnalysis<LiveRegMatrix>();
+  VRM = &getAnalysis<VirtRegMap>();
+
+  RegClassInfo.runOnMachineFunction(MF);
+
+  bool RegsAssigned = false;
+
+  // We use a reverse post-order traversal of the control-flow graph to
+  // guarantee that we visit definitions in dominance order. Since WWM
+  // expressions are guaranteed to never involve phi nodes, and we can only
+  // escape WWM through the special WWM instruction, this means that this is a
+  // perfect elimination order, so we can never do any better.
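+  //
+  // Note that V_SET_INACTIVE_B32 defs are processed below even when they do
+  // not sit between ENTER_WWM and EXIT_WWM, since set.inactive writes the
+  // inactive lanes of its result as well.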
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+
+  for (MachineBasicBlock *MBB : RPOT) {
+    bool InWWM = false;
+    for (MachineInstr &MI : *MBB) {
+      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
+        RegsAssigned |= processDef(MI.getOperand(0));
+
+      if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
+        LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+        InWWM = true;
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+        LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+        InWWM = false;
+      }
+
+      if (!InWWM)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+      for (MachineOperand &DefOpnd : MI.defs()) {
+        RegsAssigned |= processDef(DefOpnd);
+      }
+    }
+  }
+
+  if (!RegsAssigned)
+    return false;
+
+  rewriteRegs(MF);
+  return true;
+}
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -227,6 +227,10 @@
     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
   }
 
+  for (unsigned Reg : MFI->WWMReservedRegs) {
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
   return Reserved;
 }
 
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -654,8 +654,7 @@
   MachineInstr *MI;
 
   assert(SaveOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
-               SaveOrig)
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
            .addImm(-1);
   LIS->InsertMachineInstrInMaps(*MI);
 }
Index: test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -112,7 +112,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
@@ -120,8 +120,7 @@
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -133,9 +133,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -136,9 +136,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -104,9 +104,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -117,9 +117,7 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
Index: test/CodeGen/AMDGPU/fix-wwm-liveness.mir
===================================================================
--- test/CodeGen/AMDGPU/fix-wwm-liveness.mir
+++ /dev/null
@@ -1,185 +0,0 @@
-# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
-
-# Test a then phi value.
-#CHECK: test_wwm_liveness_then_phi
-#CHECK: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit %21
-
----
-name: test_wwm_liveness_then_phi
-alignment: 0
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sreg_64, preferred-register: '' }
-  - { id: 1, class: sgpr_32, preferred-register: '' }
-  - { id: 2, class: sgpr_32, preferred-register: '' }
-  - { id: 3, class: vgpr_32, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: vgpr_32, preferred-register: '' }
-  - { id: 6, class: vgpr_32, preferred-register: '' }
-  - { id: 7, class: vgpr_32, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 9, class: sreg_64, preferred-register: '' }
-  - { id: 10, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 11, class: sreg_64, preferred-register: '' }
-  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 15, class: sreg_128, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: vgpr_32, preferred-register: '' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: sreg_64, preferred-register: '' }
-  - { id: 20, class: sreg_64, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: sreg_64, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-liveins:
-body: |
-  bb.0:
-    successors: %bb.1(0x40000000), %bb.2(0x40000000)
-
-    %21 = V_MOV_B32_e32 0, implicit $exec
-    %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit $exec
-    %8 = V_CMP_GT_U32_e64 32, killed %6, implicit $exec
-    %22 = COPY $exec, implicit-def $exec
-    %23 = S_AND_B64 %22, %8, implicit-def dead $scc
-    %0 = S_XOR_B64 %23, %22, implicit-def dead $scc
-    $exec = S_MOV_B64_term killed %23
-    SI_MASK_BRANCH %bb.2, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.2(0x80000000)
-
-    %13 = S_MOV_B32 61440
-    %14 = S_MOV_B32 -1
-    %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
-    %19 = COPY $exec
-    $exec = S_MOV_B64 -1
-    %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4)
-    %17 = V_ADD_F32_e32 1065353216, killed %16, implicit $exec
-    $exec = EXIT_WWM killed %19
-    %21 = V_MOV_B32_e32 1, implicit $exec
-    early-clobber %18 = WWM killed %17, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit $exec :: (store 4)
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %0, implicit-def $scc
-    $vgpr0 = COPY killed %21
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
-
-# Test a loop with a loop exit value and a loop phi.
-#CHECK: test_wwm_liveness_loop
-#CHECK: %4:vgpr_32 = IMPLICIT_DEF
-#CHECK: bb.1:
-#CHECK: %4:vgpr_32 = FLAT_LOAD_DWORD{{.*}}, implicit %4
-#CHECK: %27:vgpr_32 = COPY killed %21, implicit %27
-
----
-name: test_wwm_liveness_loop
-alignment: 0
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '' }
-  - { id: 1, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 2, class: sreg_64, preferred-register: '' }
-  - { id: 3, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 6, class: sreg_64, preferred-register: '' }
-  - { id: 7, class: sreg_64, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '' }
-  - { id: 9, class: vreg_64, preferred-register: '' }
-  - { id: 10, class: vgpr_32, preferred-register: '' }
-  - { id: 11, class: vgpr_32, preferred-register: '' }
-  - { id: 12, class: vgpr_32, preferred-register: '' }
-  - { id: 13, class: sreg_64, preferred-register: '' }
-  - { id: 14, class: vreg_64, preferred-register: '' }
-  - { id: 15, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: vgpr_32, preferred-register: '' }
-  - { id: 20, class: vgpr_32, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: vgpr_32, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-  - { id: 24, class: sreg_64, preferred-register: '' }
-  - { id: 25, class: sreg_64, preferred-register: '' }
-  - { id: 26, class: sreg_64, preferred-register: '' }
-  - { id: 27, class: vgpr_32, preferred-register: '' }
-liveins:
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap: false
-  hasPatchPoint: false
-  stackSize: 0
-  offsetAdjustment: 0
-  maxAlignment: 0
-  adjustsStack: false
-  hasCalls: false
-  stackProtector: ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart: false
-  hasMustTailInVarArgFunc: false
-  localFrameSize: 0
-  savePoint: ''
-  restorePoint: ''
-fixedStack:
-stack:
-constants:
-body: |
-  bb.0:
-    successors: %bb.1(0x80000000)
-
-    %25:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %0:vgpr_32 = FLAT_LOAD_DWORD undef %9:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %25
-    %12:vgpr_32 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %7:sreg_64 = S_MOV_B64 0
-    %26:sreg_64 = COPY killed %7
-    %27:vgpr_32 = COPY killed %12
-
-  bb.1:
-    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-
-    %24:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %20:vgpr_32 = COPY killed %27
-    %2:sreg_64 = COPY killed %26
-    %4:vgpr_32 = FLAT_LOAD_DWORD undef %14:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %24
-    %22:vgpr_32 = V_ADD_I32_e32 -1, killed %20, implicit-def dead $vcc, implicit $exec
-    %17:sreg_64 = V_CMP_EQ_U32_e64 0, %22, implicit $exec
-    %6:sreg_64 = S_OR_B64 killed %17, killed %2, implicit-def $scc
-    %21:vgpr_32 = COPY killed %22
-    %26:sreg_64 = COPY %6
-    %27:vgpr_32 = COPY killed %21
-    $exec = S_ANDN2_B64_term $exec, %6
-    S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %6, implicit-def $scc
-    %23:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %18:vgpr_32 = V_ADD_F32_e32 killed %0, killed %4, implicit $exec
-    $exec = EXIT_WWM killed %23
-    early-clobber %19:vgpr_32 = COPY killed %18, implicit $exec
-    $vgpr0 = COPY killed %19
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
Index: test/CodeGen/AMDGPU/indirect-addressing-term.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -81,28 +81,25 @@
 ; GCN: bb.1:
 ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
 ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 8 from %stack.5, align 4, addrspace 5)
-; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
-; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
+; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+; GCN: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr0, implicit $exec
+; GCN: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0
-; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
-; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
+; GCN: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
+; GCN: renamable $vgpr17 = V_MOV_B32_e32 undef $vgpr2, implicit $exec, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16, implicit $m0
 ; GCN: S_SET_GPR_IDX_OFF
-; GCN: renamable $vgpr19 = COPY renamable $vgpr18
-; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
-; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.5, align 4, addrspace 5)
-; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.6, align 4, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
-; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
-; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
+; GCN: renamable $vgpr18 = COPY renamable $vgpr17
+; GCN: renamable $sgpr4_sgpr5 = COPY renamable $sgpr0_sgpr1
+; GCN: SI_SPILL_S64_SAVE killed $sgpr4_sgpr5, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.5, align 4, addrspace 5)
+; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
+; GCN: SI_SPILL_V32_SAVE killed $vgpr17, %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.6, addrspace 5)
+; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
 ; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 ; GCN: bb.2:
 ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 8 from %stack.3, align 4, addrspace 5)
 ; GCN: $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1
-; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
+; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5)
 ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 16 from %stack.1, align 4, addrspace 5)
 ; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
 ; GCN: S_ENDPGM
Index: test/CodeGen/AMDGPU/wqm.mir
===================================================================
--- test/CodeGen/AMDGPU/wqm.mir
+++ test/CodeGen/AMDGPU/wqm.mir
@@ -3,7 +3,7 @@
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
-#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: ENTER_WWM
 #CHECK: S_CMP_LT_I32
 #CHECK: S_CSELECT_B32
 name: test_wwm_scc
Index: test/CodeGen/AMDGPU/wwm-reserved.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -0,0 +1,166 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,GFX9-O0 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,GFX9-O3 %s
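+
+; Check that values produced by llvm.amdgcn.set.inactive/llvm.amdgcn.wwm
+; chains survive in their pre-allocated, reserved registers, with and without
+; control flow, across a call, and at -O0.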
+
+define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) {
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+  %tmp108 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 273, i32 15, i32 15, i1 false)
+  %tmp109 = add i32 %tmp108, %tmp105
+  %tmp110 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 274, i32 15, i32 15, i1 false)
+  %tmp111 = add i32 %tmp109, %tmp110
+  %tmp112 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 275, i32 15, i32 15, i1 false)
+  %tmp113 = add i32 %tmp111, %tmp112
+  %tmp114 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp113, i32 276, i32 15, i32 14, i1 false)
+  %tmp115 = add i32 %tmp113, %tmp114
+  %tmp116 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp115, i32 280, i32 15, i32 12, i1 false)
+  %tmp117 = add i32 %tmp115, %tmp116
+  %tmp118 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp117, i32 322, i32 10, i32 15, i1 false)
+  %tmp119 = add i32 %tmp117, %tmp118
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp119, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp119, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+
+  %tmp123 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 273, i32 15, i32 15, i1 false)
+  %tmp124 = add i32 %tmp123, %tmp107
+  %tmp125 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 274, i32 15, i32 15, i1 false)
+  %tmp126 = add i32 %tmp124, %tmp125
+  %tmp127 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 275, i32 15, i32 15, i1 false)
+  %tmp128 = add i32 %tmp126, %tmp127
+  %tmp129 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp128, i32 276, i32 15, i32 14, i1 false)
+  %tmp130 = add i32 %tmp128, %tmp129
+  %tmp131 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp130, i32 280, i32 15, i32 12, i1 false)
+  %tmp132 = add i32 %tmp130, %tmp131
+  %tmp133 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp132, i32 322, i32 10, i32 15, i1 false)
+  %tmp134 = add i32 %tmp132, %tmp133
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp134, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp134, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %tmp137
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_cs void @cfg(<4 x i32> inreg %tmp14, i32 %arg) {
+entry:
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+  %tmp108 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 273, i32 15, i32 15, i1 false)
+  %tmp109 = add i32 %tmp108, %tmp105
+  %tmp110 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 274, i32 15, i32 15, i1 false)
+  %tmp111 = add i32 %tmp109, %tmp110
+  %tmp112 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 275, i32 15, i32 15, i1 false)
+  %tmp113 = add i32 %tmp111, %tmp112
+  %tmp114 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp113, i32 276, i32 15, i32 14, i1 false)
+  %tmp115 = add i32 %tmp113, %tmp114
+  %tmp116 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp115, i32 280, i32 15, i32 12, i1 false)
+  %tmp117 = add i32 %tmp115, %tmp116
+  %tmp118 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp117, i32 322, i32 10, i32 15, i1 false)
+  %tmp119 = add i32 %tmp117, %tmp118
+
+
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]]
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp119, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp119, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if, label %merge
+if:
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+  %tmp123 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 273, i32 15, i32 15, i1 false)
+  %tmp124 = add i32 %tmp123, %tmp107
+  %tmp125 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 274, i32 15, i32 15, i1 false)
+  %tmp126 = add i32 %tmp124, %tmp125
+  %tmp127 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 275, i32 15, i32 15, i1 false)
+  %tmp128 = add i32 %tmp126, %tmp127
+  %tmp129 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp128, i32 276, i32 15, i32 14, i1 false)
+  %tmp130 = add i32 %tmp128, %tmp129
+  %tmp131 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp130, i32 280, i32 15, i32 12, i1 false)
+  %tmp132 = add i32 %tmp130, %tmp131
+  %tmp133 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp132, i32 322, i32 10, i32 15, i1 false)
+  %tmp134 = add i32 %tmp132, %tmp133
+
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]]
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp134, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp134, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+  br label %merge
+
+merge:
+  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]]
+; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %merge_value
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+define i32 @called(i32 %a) noinline {
+; GFX9: v_add_u32_e32 v1, v0, v0
+  %add = add i32 %a, %a
+; GFX9: v_mul_lo_i32 v0, v1, v0
+  %mul = mul i32 %add, %a
+; GFX9: v_sub_u32_e32 v0, v0, v1
+  %sub = sub i32 %mul, %add
+  ret i32 %sub
+}
+
+define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
+; GFX9-O0: v_mov_b32_e32 v2, v0
+; GFX9-O3: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_not_b64 exec, exec
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
+; GFX9: v_mov_b32_e32 v0, v2
+; GFX9: s_waitcnt lgkmcnt(0)
+; GFX9: s_swappc_b64
+  %tmp134 = call i32 @called(i32 %tmp107)
+; GFX9: v_mov_b32_e32 v1, v0
+; GFX9: v_add_u32_e32 v1, v1, v2
+  %tmp136 = add i32 %tmp134, %tmp107
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+; GFX9: buffer_store_dword v0
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wwm.i32(i32)
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32)