Index: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -82,6 +83,9 @@
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
+
+  MachineDominatorTree *MDT;
+
 public:
   static char ID;
 
@@ -92,6 +96,8 @@
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -99,8 +105,12 @@
 
 } // End anonymous namespace
 
-INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
-                "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+                     "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+                     "SI Fix SGPR copies", false, false)
+
 
 char SIFixSGPRCopies::ID = 0;
 
@@ -274,11 +284,22 @@
   return false;
 }
 
+static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI) {
+  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
+       E = MBB.end(); I != E; ++I) {
+    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
+      return true;
+  }
+  return false;
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
 
   SmallVector<MachineInstr *, 16> Worklist;
 
@@ -309,11 +330,23 @@
         break;
       }
       case AMDGPU::PHI: {
-        DEBUG(dbgs() << "Fixing PHI: " << MI);
         unsigned Reg = MI.getOperand(0).getReg();
         if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
           break;
 
+        // We don't need to fix the PHI if the common dominator of the
+        // two incoming blocks terminates with a uniform branch.
+        if (MI.getNumExplicitOperands() == 5) {
+          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+          if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+            DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+            break;
+          }
+        }
+
         // If a PHI node defines an SGPR and any of its operands are VGPRs,
         // then we need to move it to the VALU.
         //
@@ -340,10 +373,6 @@
         // ...
         // use sgpr2
         //
-        // FIXME: This is OK if the branching decision is made based on an
-        // SGPR value.
-        bool SGPRBranch = false;
-
         // The one exception to this rule is when one of the operands
         // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
         // instruction.  In this case, there we know the program will
@@ -353,7 +382,8 @@
 
         SmallSet<unsigned, 8> Visited;
         if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
-            (!SGPRBranch && !phiHasBreakDef(MI, MRI, Visited))) {
+            !phiHasBreakDef(MI, MRI, Visited)) {
+          DEBUG(dbgs() << "Fixing PHI: " << MI);
           TII->moveToVALU(MI);
         }
         break;
Index: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -163,15 +163,13 @@
   ret void
 }
 
-; FIXME: Should be able to use s_cbranch_scc0
 ; GCN-LABEL: {{^}}long_backward_sbranch:
-; GCN: v_mov_b32_e32 [[LOOPIDX:v[0-9]+]], 0{{$}}
+; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}
 
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
+; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10
 
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: v_nop_e64
@@ -179,7 +177,7 @@
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
 
-; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
Index: llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -98,6 +98,9 @@
 ; GCN: v_cmp_eq_u32_e32 vcc, 1,
 
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
+; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80
+; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 4
+
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
 ; GCN-NEXT: s_endpgm
Index: llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 9
+; CHECK: ; NumVgprs: 8
 define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
Index: llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -4,7 +4,7 @@
 ; At the same time condition shall not be serialized into a VGPR and deserialized later
 ; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
 
-; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: v_cmp_{{..}}_u32_e{{32|64}} [[COND:s\[[0-9]+:[0-9]+\]|vcc]]
 ; CHECK: BB0_1:
 ; CHECK-NOT: v_cmp
 ; CHECK_NOT: v_cndmask
Index: llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -1,5 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
+; FIXME: Enabling critical edge splitting will fix this.
+; XFAIL: *
+
 ; Make sure that m0 is not reinitialized in the loop.
 
 ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
@@ -12,7 +15,7 @@
 ; GCN: ds_read_b32
 ; GCN: buffer_store_dword
 
-; GCN: s_cbranch_vccz BB0_2
+; GCN: s_cbranch_scc0 BB0_2
 
 ; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
Index: llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll
@@ -27,9 +27,8 @@
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
 ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: v_cmp_lt_i32_e32 vcc,
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
+; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#2: ; %bb4
 ; GCN: buffer_load_dword
Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,16 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}uniform_if_scc:
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 ; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
@@ -29,17 +30,16 @@
 }
 
 ; GCN-LABEL: {{^}}uniform_if_vcc:
-; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
-; also scheduled the write first.
 ; GCN-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
@@ -59,14 +59,15 @@
 
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc:
 ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 ; GCN: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
@@ -85,17 +86,16 @@
 }
 
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
-; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
-; also scheduled the write first.
 ; GCN-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
@@ -276,15 +276,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}uniform_loop:
-; GCN: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
-; FIXME: We need to teach GCNFixSGPRCopies about uniform branches so we
-;        get s_add_i32 here.
-; GCN: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, [[I]]
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[LOOP_LABEL]]
-; GCN: s_endpgm
+; SI-LABEL: {{^}}uniform_loop:
+; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
+; SI: s_add_i32 [[I:s[0-9]+]],  s{{[0-9]+}}, -1
+; SI: s_cmp_lg_u32 [[I]], 0
+; SI: s_cbranch_scc1 [[LOOP_LABEL]]
+; SI: s_endpgm
 define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
   br label %loop
@@ -433,7 +430,7 @@
 
 ; GCN-LABEL: {{^}}uniform_if_scc_i64_eq:
 ; VI-DAG: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 
 ; SI: v_cmp_eq_u64_e64
 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
@@ -441,10 +438,11 @@
 ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i64 %cond, 0
@@ -464,7 +462,7 @@
 
 ; GCN-LABEL: {{^}}uniform_if_scc_i64_ne:
 ; VI-DAG: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 0
-; GCN-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; GCN-DAG: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 
 ; SI: v_cmp_ne_u64_e64
 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
@@ -472,10 +470,11 @@
 ; VI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp ne i64 %cond, 0
@@ -494,14 +493,16 @@
 }
 
 ; GCN-LABEL: {{^}}uniform_if_scc_i64_sgt:
+; GCN: s_mov_b32 [[S_VAL:s[0-9]+]], 0
 ; GCN: v_cmp_gt_i64_e64
 ; GCN: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; Fall-through to the else
-; GCN: v_mov_b32_e32 [[STORE_VAL]], 1
+; GCN: s_mov_b32 [[S_VAL]], 1
 
 ; GCN: [[IF_LABEL]]:
-; GCN: buffer_store_dword [[STORE_VAL]]
+; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
+; GCN: buffer_store_dword [[V_VAL]]
 define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -7,13 +7,11 @@
 ; CHECK: s_and_saveexec_b64
 ; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
-; CHECK-NEXT: s_cbranch_execz
 
 ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
 
 ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
-; CHECK: s_and_b64 vcc, exec, vcc
-; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
+; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
 
 ; CHECK: s_endpgm
 define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
Index: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
@@ -101,9 +101,8 @@
 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
-; SI-DAG: v_cmp_eq_u32_e32 vcc,
-; SI-DAG: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
+; SI: s_cbranch_scc0 [[LABEL_LOOP]]
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm