Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1072,6 +1072,24 @@
                            + StringRef(RegName) + "\"."));
 }
 
+static void replaceSuccessorPhisWith(MachineBasicBlock &BB,
+                                     MachineBasicBlock &SplitBB) {
+  for (MachineBasicBlock *Succ : BB.successors()) {
+    for (MachineInstr &MI : *Succ) {
+      if (!MI.isPHI())
+        break;
+
+      for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) {
+        MachineOperand &FromBB = MI.getOperand(I);
+        if (&BB == FromBB.getMBB()) {
+          FromBB.setMBB(&SplitBB);
+          break;
+        }
+      }
+    }
+  }
+}
+
 // If kill is not the last instruction, split the block so kill is always a
 // proper terminator.
 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
@@ -1093,20 +1111,7 @@
 
   // Fix the block phi references to point to the new block for the defs in the
   // second piece of the block.
-  for (MachineBasicBlock *Succ : BB->successors()) {
-    for (MachineInstr &MI : *Succ) {
-      if (!MI.isPHI())
-        break;
-
-      for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) {
-        MachineOperand &FromBB = MI.getOperand(I);
-        if (BB == FromBB.getMBB()) {
-          FromBB.setMBB(SplitBB);
-          break;
-        }
-      }
-    }
-  }
+  replaceSuccessorPhisWith(*BB, *SplitBB);
 
   MF->insert(++MachineFunction::iterator(BB), SplitBB);
   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
@@ -1161,7 +1166,7 @@
   // Compare the just read M0 value to all possible Idx values.
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
     .addReg(CurrentIdxReg)
-    .addOperand(IdxReg);
+    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
 
   // Move index from VCC into M0
   if (Offset == 0) {
@@ -1232,6 +1237,8 @@
   MF->insert(MBBI, LoopBB);
   MF->insert(MBBI, RemainderBB);
 
+  replaceSuccessorPhisWith(MBB, *RemainderBB);
+
   LoopBB->addSuccessor(LoopBB);
   LoopBB->addSuccessor(RemainderBB);
 
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -503,12 +503,12 @@
 ; Test that the or is folded into the base address register instead of
 ; added to m0
 
-; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
-; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-; GCN-NOT: [[IDX_SHL]]
-; GCN: s_mov_b32 m0, [[IDX_SHL]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-LABEL: {{^}}extractelement_v4i32_or_index:
+; CHECK: s_load_dword [[IDX_IN:s[0-9]+]]
+; CHECK: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; CHECK-NOT: [[IDX_SHL]]
+; CHECK: s_mov_b32 m0, [[IDX_SHL]]
+; CHECK: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
@@ -519,12 +519,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
-; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-; GCN-NOT: [[IDX_SHL]]
-; GCN: s_mov_b32 m0, [[IDX_SHL]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-LABEL: {{^}}insertelement_v4f32_or_index:
+; CHECK: s_load_dword [[IDX_IN:s[0-9]+]]
+; CHECK: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; CHECK-NOT: [[IDX_SHL]]
+; CHECK: s_mov_b32 m0, [[IDX_SHL]]
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
@@ -533,6 +533,41 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}broken_phi_bb:
+; CHECK: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
+
+; CHECK: s_branch [[BB2:BB[0-9]+_[0-9]+]]
+
+; CHECK: {{^BB[0-9]+_[0-9]+}}:
+; CHECK: s_mov_b64 exec,
+
+; CHECK: [[BB2]]:
+; CHECK: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
+; CHECK: buffer_load_dword
+
+; CHECK: [[REGLOOP:BB[0-9]+_[0-9]+]]:
+; CHECK: v_movreld_b32_e32
+; CHECK: s_cbranch_execnz [[REGLOOP]]
+define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb4, %bb
+  %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
+  %tmp3 = icmp slt i32 %tmp, %arg
+  br i1 %tmp3, label %bb4, label %bb8
+
+bb4:                                              ; preds = %bb2
+  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
+  %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
+  %tmp7 = extractelement <8 x i32> %tmp6, i32 0
+  br label %bb2
+
+bb8:                                              ; preds = %bb2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }