Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -75,6 +75,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include <queue>
+#include <unordered_set>
 
 using namespace llvm;
 
@@ -327,6 +329,31 @@
   return true;
 }
 
+/// Return true if hasTerminatorThatModifiesExec() holds for \p MBB itself or
+/// for any block that reaches it through (transitive) predecessor edges.
+/// A single worklist is used so that a queued block can never be dropped.
+static bool isPredsHasTerminator(MachineBasicBlock *MBB,
+                                 const TargetRegisterInfo *TRI) {
+  std::queue<MachineBasicBlock *> Worklist;
+  std::unordered_set<MachineBasicBlock *> Visited;
+  Worklist.push(MBB);
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *Cur = Worklist.front();
+    Worklist.pop();
+    // insert().second is false for a block already seen (e.g. a CFG cycle).
+    if (!Visited.insert(Cur).second)
+      continue;
+
+    if (hasTerminatorThatModifiesExec(*Cur, *TRI))
+      return true;
+    for (MachineBasicBlock *Pred : Cur->predecessors())
+      Worklist.push(Pred);
+  }
+
+  return false;
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -383,8 +410,8 @@
           MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
           MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
 
-          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
-          if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+          if (!isPredsHasTerminator(MBB0, TRI) &&
+              !isPredsHasTerminator(MBB1, TRI)) {
             DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
             break;
           }
Index: test/CodeGen/AMDGPU/sgprcopies.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sgprcopies.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}checkTwoBlocksWithUniformBranch
+; SI: BB0_2
+; SI: v_add
+define void @checkTwoBlocksWithUniformBranch(i32 addrspace(1)* nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) {
+entry:                                            ; derives %x and %y from the workitem id, then decides whether to enter the loop
+  %conv = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %rem = urem i32 %conv, %width
+  %div = udiv i32 %conv, %width
+  %conv1 = sitofp i32 %rem to float
+  %x = tail call float @llvm.fmuladd.f32(float %xStep, float %conv1, float %xPos)
+  %conv2 = sitofp i32 %div to float
+  %y = tail call float @llvm.fmuladd.f32(float %yStep, float %conv2, float %yPos)
+  %yy = fmul float %y, %y
+  %xy = tail call float @llvm.fmuladd.f32(float %x, float %x, float %yy) ; x*x + y*y
+  %cmp01 = fcmp ole float %xy, 4.000000e+00
+  %cmp02 = icmp ne i32 %maxIter, 0
+  %cond01 = and i1 %cmp02, %cmp01
+  br i1 %cond01, label %for.body.preheader, label %for.end ; loop is entered only if %maxIter != 0 and %xy <= 4.0; %xy depends on the workitem id
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %x_val = phi float [ %call8, %for.body ], [ %x, %for.body.preheader ]
+  %iter_val = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %y_val = phi float [ %call9, %for.body ], [ %y, %for.body.preheader ]
+  %sub = fsub float -0.000000e+00, %y_val ; -0.0 - y_val
+  %call7 = tail call float @llvm.fmuladd.f32(float %x_val, float %x_val, float %x) #1
+  %call8 = tail call float @llvm.fmuladd.f32(float %sub, float %y_val, float %call7) #1 ; next %x_val
+  %mul = fmul float %x_val, 2.000000e+00
+  %call9 = tail call float @llvm.fmuladd.f32(float %mul, float %y_val, float %y) #1 ; next %y_val
+  %inc = add nuw i32 %iter_val, 1
+  %mul3 = fmul float %call9, %call9
+  %0 = tail call float @llvm.fmuladd.f32(float %call8, float %call8, float %mul3)
+  %cmp = fcmp ole float %0, 4.000000e+00
+  %cmp5 = icmp ult i32 %inc, %maxIter
+  %or.cond = and i1 %cmp5, %cmp
+  br i1 %or.cond, label %for.body, label %for.end.loopexit ; back-edge taken while %inc < %maxIter and %0 <= 4.0
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %iter.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %for.end.loopexit ] ; 0 when the loop was skipped, otherwise the last %inc
+  %idxprom = ashr exact i32 %conv, 32 ; NOTE(review): shift amount equals the i32 bit width, which LangRef defines as poison - confirm this is intended
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idxprom
+  store i32 %iter.0.lcssa, i32 addrspace(1)* %arrayidx, align 4 ; write the iteration count to %out[%idxprom]
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }