Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -26,6 +26,16 @@
 private:
   const SIRegisterInfo RI;
 
+  // The inverse predicate should have the negative value.
+  enum BranchPredicate {
+    INVALID_BR = 0,
+    SCC_TRUE = 1,
+    SCC_FALSE = -1
+  };
+
+  static unsigned getBranchOpcode(BranchPredicate Cond);
+  static BranchPredicate getBranchPredicate(unsigned Opcode);
+
   unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
                               MachineRegisterInfo &MRI,
                               MachineOperand &SuperReg,
@@ -136,6 +146,17 @@
                              unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        DebugLoc DL) const override;
+
   bool areMemAccessesTriviallyDisjoint(
     MachineInstr *MIa, MachineInstr *MIb,
     AliasAnalysis *AA = nullptr) const override;
@@ -493,7 +514,6 @@
 
   ScheduleHazardRecognizer *
   CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
-
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1057,6 +1057,115 @@
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+  switch (Cond) {
+  case SIInstrInfo::SCC_TRUE:
+    return AMDGPU::S_CBRANCH_SCC1;
+  case SIInstrInfo::SCC_FALSE:
+    return AMDGPU::S_CBRANCH_SCC0;
+  default:
+    llvm_unreachable("invalid branch predicate");
+  }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_CBRANCH_SCC0:
+    return SCC_FALSE;
+  case AMDGPU::S_CBRANCH_SCC1:
+    return SCC_TRUE;
+  default:
+    return INVALID_BR;
+  }
+}
+
+bool SIInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    // Unconditional Branch
+    TBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+  if (Pred == INVALID_BR)
+    return true;
+
+  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Pred));
+
+  ++I;
+
+  if (I == MBB.end()) {
+    // Conditional branch followed by fall-through.
+    TBB = CondBB;
+    return false;
+  }
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    TBB = CondBB;
+    FBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  unsigned Count = 0;
+  while (I != MBB.end()) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    I->eraseFromParent();
+    ++Count;
+    I = Next;
+  }
+
+  return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   DebugLoc DL) const {
+
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  if (!FBB) {
+    BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && FBB);
+
+  BuildMI(&MBB, DL, get(Opcode))
+    .addMBB(TBB);
+  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+    .addMBB(FBB);
+
+  return 2;
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
Index: test/CodeGen/AMDGPU/ctpop64.ll
===================================================================
--- test/CodeGen/AMDGPU/ctpop64.ll
+++ test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
@@ -110,12 +110,9 @@
   ret void
 }
 
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when the should be allowed.
-
 ; FUNC-LABEL: {{^}}ctpop_i64_in_br:
-; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
 ; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
 ; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -3,19 +3,19 @@
 ; Make sure that m0 is not reinitialized in the loop.
 
 ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
-; GCN: s_cbranch_scc1 BB0_2
+; GCN: s_cbranch_scc1 BB0_3
 
 ; Initialize in preheader
 ; GCN: s_mov_b32 m0, -1
 
-; GCN: BB0_3:
+; GCN: BB0_2:
 ; GCN: ds_read_b32
 ; GCN: buffer_store_dword
 
-; GCN: s_cbranch_vccnz BB0_2
-; GCN: s_branch BB0_3
+; GCN: s_cbranch_vccnz BB0_3
+; GCN: s_branch BB0_2
 
-; GCN: BB0_2:
+; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
 define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
 bb:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.inv() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv:
 ; GCN-NEXT: ; BB#0:
@@ -15,10 +16,11 @@
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait:
 ; GCN-NEXT: ; BB#0:
-; GCN-NEXT: s_dcache_inv
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_dcache_inv
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_inv_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv_vol:
 ; GCN-NEXT: ; BB#0:
@@ -16,9 +17,10 @@
 ; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait:
 ; GCN-NEXT: ; BB#0:
 ; GCN-NEXT: s_dcache_inv_vol
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_inv_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.wb() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; VI-LABEL: {{^}}test_s_dcache_wb:
 ; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@
 ; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait:
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_wb_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.wb.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; VI-LABEL: {{^}}test_s_dcache_wb_vol:
 ; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@
 ; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait:
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_wb_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 ;
 ;
 ; Most SALU instructions ignore control flow, so we need to make sure
@@ -67,7 +67,7 @@
 ; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
 
-; SI: BB2_1:
+; SI: BB2_2:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
 ; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -166,31 +166,70 @@
 }
 
-; SI-LABEL: {{^}}uniform_if_else:
+; SI-LABEL: {{^}}uniform_if_else_ret:
 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
-; SI: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_branch [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[ELSE_LABEL]]:
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_endpgm
+
+; SI: {{^}}[[IF_LABEL]]:
 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; SI: buffer_store_dword [[ONE]]
-; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_endpgm
+define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+entry:
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+  store i32 1, i32 addrspace(1)* %out
+  br label %if.end
+
+if.else: ; preds = %entry
+  store i32 2, i32 addrspace(1)* %out
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_else:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_branch [[IF_LABEL:[0-9_A-Za-z]+]]
+
 ; SI: [[ELSE_LABEL]]:
 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; SI: buffer_store_dword [[TWO]]
+; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+
 ; SI: [[ENDIF_LABEL]]:
+; SI: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
+; SI: buffer_store_dword [[THREE]]
 ; SI: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out, i32 %a) {
+define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then: ; preds = %entry
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, i32 addrspace(1)* %out0
   br label %if.end
 
 if.else: ; preds = %entry
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, i32 addrspace(1)* %out0
   br label %if.end
 
 if.end: ; preds = %if.else, %if.then
+  store i32 3, i32 addrspace(1)* %out1
   ret void
 }
 
@@ -368,15 +407,15 @@
 
 ; SI-LABEL: {{^}}cse_uniform_condition_different_blocks:
 ; SI: s_load_dword [[COND:s[0-9]+]]
 ; SI: s_cmp_lt_i32 [[COND]], 1
-; SI: s_cbranch_scc1 BB13_3
+; SI: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
 
 ; SI: BB#1:
 ; SI-NOT: cmp
 ; SI: buffer_load_dword
 ; SI: buffer_store_dword
-; SI: s_cbranch_scc1 BB13_3
+; SI: s_cbranch_scc1 BB[[FNNUM]]_3
 
-; SI: BB13_3:
+; SI: BB[[FNNUM]]_3:
 ; SI: s_endpgm
 define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -72,19 +72,18 @@
 ; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz BB2_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
-; SI: ; BB#1:
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
-; SI: BB2_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
 ; SI-DAG: v_cmp_eq_i32_e32 vcc,
 ; SI-DAG: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccnz BB2_2
-; SI: s_branch BB2_3
-; SI: BB2_2:
+; SI: s_cbranch_vccnz [[LABEL_EXIT]]
+; SI: s_branch [[LABEL_LOOP]]
+; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 
 define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
@@ -117,7 +116,7 @@
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz BB3_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
 ; SI: ; BB#1:
@@ -125,7 +124,7 @@
 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
 
 ; Clear exec bits for workitems that load -1s
-; SI: BB3_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword [[B:v[0-9]+]]
 ; SI: buffer_load_dword [[A:v[0-9]+]]
 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
@@ -133,23 +132,23 @@
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
 ; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
-; SI: s_cbranch_execz BB3_5
+; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
 
-; SI: BB#4:
+; SI: BB#3:
 ; SI: buffer_store_dword
 ; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
-; SI: BB3_5:
+; SI: [[LABEL_FLOW]]:
 ; SI: s_or_b64 exec, exec, [[ORNEG2]]
 ; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
 ; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz BB3_3
+; SI: s_cbranch_execnz [[LABEL_LOOP]]
 
-; SI: BB#6
+; SI: BB#5
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
 
-; SI: BB3_2:
+; SI: [[LABEL_EXIT]]:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
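
A note on the BranchPredicate encoding added in SIInstrInfo.h above: because SCC_TRUE and SCC_FALSE are defined as 1 and -1, inverting a condition amounts to negating the immediate stored in Cond[0], which is the property a ReverseBranchCondition-style hook could later exploit. Below is a minimal standalone sketch of that invariant; it is illustrative only and not part of the patch, and the invert helper is hypothetical:

#include <cassert>

// Mirrors the enum added to SIInstrInfo.h: the inverse predicate is the
// negation of the original value.
enum BranchPredicate {
  INVALID_BR = 0,
  SCC_TRUE = 1,
  SCC_FALSE = -1
};

// Hypothetical helper: inverting a valid predicate is plain negation.
static BranchPredicate invert(BranchPredicate Pred) {
  assert(Pred != INVALID_BR && "cannot invert an invalid predicate");
  return static_cast<BranchPredicate>(-static_cast<int>(Pred));
}

int main() {
  // branch-if-SCC-set inverts to branch-if-SCC-clear, and vice versa.
  assert(invert(SCC_TRUE) == SCC_FALSE);
  assert(invert(SCC_FALSE) == SCC_TRUE);
  return 0;
}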