Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -82,6 +83,9 @@
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
+
+  MachineDominatorTree *MDT;
+
 public:
   static char ID;
 
@@ -94,6 +98,7 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -101,8 +106,12 @@
 
 } // End anonymous namespace
 
-INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
-                "SI Fix SGPR copies", false, false)
+// The dependency below must match the analysis required in getAnalysisUsage().
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+                      "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+                    "SI Fix SGPR copies", false, false)
 
 char SIFixSGPRCopies::ID = 0;
 
@@ -236,11 +245,31 @@
   return true;
 }
 
+/// Return true if \p MBB has no terminator (a fall-through, which is a uniform
+/// branch) or if its first terminator is one of the scalar branch opcodes.
+static bool hasUniformTerminator(const MachineBasicBlock *MBB) {
+  MachineBasicBlock::const_iterator FirstTerm = MBB->getFirstTerminator();
+  if (FirstTerm == MBB->end())
+    return true;
+
+  switch (FirstTerm->getOpcode()) {
+  case AMDGPU::S_BRANCH:
+  case AMDGPU::S_CBRANCH_SCC0:
+  case AMDGPU::S_CBRANCH_SCC1:
+  case AMDGPU::S_CBRANCH_VCCNZ:
+  case AMDGPU::S_CBRANCH_VCCZ:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
 
   SmallVector<MachineInstr *, 16> Worklist;
 
@@ -271,11 +300,23 @@
         break;
       }
       case AMDGPU::PHI: {
-        DEBUG(dbgs() << "Fixing PHI: " << MI);
         unsigned Reg = MI.getOperand(0).getReg();
         if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
           break;
 
+        // We don't need to fix the PHI if the nearest common dominator of the
+        // two incoming blocks terminates with a uniform branch.
+        if (MI.getNumExplicitOperands() == 5) {
+          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+          if (NCD && hasUniformTerminator(NCD)) {
+            DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+            break;
+          }
+        }
+
         // If a PHI node defines an SGPR and any of its operands are VGPRs,
         // then we need to move it to the VALU.
         //
@@ -302,10 +343,6 @@
         // ...
         // use sgpr2
         //
-        // FIXME: This is OK if the branching decision is made based on an
-        // SGPR value.
-        bool SGPRBranch = false;
-
         // The one exception to this rule is when one of the operands
         // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
         // instruction.  In this case, there we know the program will
@@ -313,6 +350,7 @@
         // the first block (where the condition is computed), so there
         // is no chance for values to be over-written.
 
+        DEBUG(dbgs() << "Fixing PHI: " << MI);
         bool HasBreakDef = false;
         for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
           unsigned Reg = MI.getOperand(i).getReg();
@@ -336,7 +374,7 @@
           }
         }
 
-        if (!SGPRBranch && !HasBreakDef)
+        if (!HasBreakDef)
           TII->moveToVALU(MI);
         break;
       }
Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -97,7 +97,7 @@
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cmp_eq_i32_e32 vcc, 1,
 
-; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: s_and_b64 vcc, exec, vcc
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -12,7 +12,7 @@
 ; GCN: ds_read_b32
 ; GCN: buffer_store_dword
 
-; GCN: s_cbranch_vccz BB0_2
+; GCN: s_cbranch_scc0 BB0_2
 
 ; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -27,9 +27,8 @@
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
 ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: v_cmp_lt_i32_e32 vcc,
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
+; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#2: ; %bb4
 ; GCN: buffer_load_dword
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -281,12 +281,9 @@
 
 ; SI-LABEL: {{^}}uniform_loop:
 ; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
-; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
-;        get s_add_i32 here.
-; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
-; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
-; SI: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccnz [[LOOP_LABEL]]
+; SI: s_add_i32 [[I:s[0-9]+]], s{{[0-9]+}}, -1
+; SI: s_cmp_lg_i32 [[I]], 0
+; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
 define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -79,9 +79,8 @@
 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
-; SI-DAG: v_cmp_eq_i32_e32 vcc,
-; SI-DAG: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI-DAG: s_cmp_eq_i32
+; SI: s_cbranch_scc0 [[LABEL_LOOP]]
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm