Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -70,6 +70,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -193,6 +194,13 @@
                                   MemInfoMap &Visited,
                                   SmallPtrSet<MachineInstr *, 4> &Promoted);
 
+  // Used to extend addToListsIfDependent so it can also signal a bail-out.
+  enum AddToStat { AddToTrue, AddToFalse, AddToBail };
+  AddToStat addToListsIfDependent(MachineInstr &MI,
+                                  DenseSet<unsigned> &RegDefs,
+                                  DenseSet<unsigned> &PhysRegUses,
+                                  SmallVectorImpl<MachineInstr *> &Insts);
+
 public:
   static char ID;
 
@@ -263,11 +271,45 @@
   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
 }
 
+static void getLiveRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
+  const MachineBasicBlock &B = *MI.getParent();
+  Regs.addLiveOuts(B);
+  auto E = ++MachineBasicBlock::const_iterator(MI.getIterator()).getReverse();
+  for (auto I = B.rbegin(); I != E; ++I)
+    Regs.stepBackward(*I);
+}
+
+// Get the adjacent instruction which defines physical Reg used by this MI.
+static MachineInstr *getPhysRegAdjacentInstr(MachineInstr &MI, unsigned Reg,
+                                             const SIRegisterInfo *TRI,
+                                             MachineRegisterInfo *MRI) {
+  if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+  // If Reg is available at MI, then Reg is not live here.
+  LivePhysRegs LiveAtMI(*TRI);
+  getLiveRegsAt(LiveAtMI, MI);
+  if (LiveAtMI.available(*MRI, Reg))
+    return nullptr;
+  // Only look at the previous instruction for the defining instr.
+  MachineBasicBlock::reverse_iterator I = MI;
+  I++;
+  // If Reg is not available at I either, I cannot be the defining instr.
+  getLiveRegsAt(LiveAtMI, *I);
+  if (!LiveAtMI.available(*MRI, Reg))
+    return nullptr;
+  // Reg becomes live at I; does this instr define it?
+  if (I->definesRegister(Reg))
+    return &*I;
+  return nullptr;
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
-                                  DenseSet<unsigned> &PhysRegUses,
-                                  SmallVectorImpl<MachineInstr *> &Insts) {
+SILoadStoreOptimizer::AddToStat SILoadStoreOptimizer::addToListsIfDependent(
+    MachineInstr &MI, DenseSet<unsigned> &RegDefs,
+    DenseSet<unsigned> &PhysRegUses,
+    SmallVectorImpl<MachineInstr *> &Insts) {
+
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
@@ -281,13 +323,24 @@
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
           PhysRegUses.count(Use.getReg())))) {
+      // If this MI depends on a physical reg such as SCC, find and add the
+      // defining instr. If it is not found, bail on this optimization.
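+      // (For example, an S_ADDC_U32 reads the SCC carry defined by an earlier
+      // S_ADD_U32; moving the use without its defining instruction would
+      // corrupt the carry. The instruction pair is only an illustration.)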
+      if (Use.isImplicit() &&
+          TargetRegisterInfo::isPhysicalRegister(Use.getReg())) {
+        MachineInstr *Prev = getPhysRegAdjacentInstr(MI, Use.getReg(),
+                                                     TRI, MRI);
+        if (Prev)
+          Insts.push_back(Prev);
+        else
+          return AddToBail;
+      }
       Insts.push_back(&MI);
       addDefsUsesToList(MI, RegDefs, PhysRegUses);
-      return true;
+      return AddToTrue;
     }
   }
 
-  return false;
+  return AddToFalse;
 }
 
 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
@@ -577,8 +630,11 @@
       // When we match I with another DS instruction we will be moving I down
       // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
-      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                            CI.InstsToMove);
+      AddToStat AStat = addToListsIfDependent(*MBBI, RegDefsToMove,
+                                              PhysRegUsesToMove,
+                                              CI.InstsToMove);
+      if (AStat == AddToBail)
+        return false;
       continue;
     }
 
@@ -592,9 +648,13 @@
     // DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                              CI.InstsToMove))
+    AddToStat AStat = addToListsIfDependent(*MBBI, RegDefsToMove,
+                                            PhysRegUsesToMove,
+                                            CI.InstsToMove);
+    if (AStat == AddToTrue)
       continue;
+    if (AStat == AddToBail)
+      return false;
 
     bool Match = true;
     for (unsigned i = 0; i < NumAddresses; i++) {
Index: test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=CHECK %s
+
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK-NOT: s_lshl_b32
+; CHECK: s_addc_u32
+; CHECK: global_load_dword
+
+%0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+%1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+%2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+%3 = type { %4, %5, %3* }
+%4 = type { i64, i64, i64, i64, i32 }
+%5 = type { i8, i8, i16, i16, i16, i16, i64 }
+%6 = type { %3 }
+%7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+%8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+%9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+%10 = type { [1024 x i16] }
+%11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+@omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+@usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i64 @__ockl_get_local_size()
+  %tmp1 = trunc i64 %tmp to i32
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  ret void
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+  %tmp5 = and i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+  %tmp8 = trunc i32 %tmp7 to i16
+  store i16 %tmp8, i16* undef, align 2
+  %tmp9 = getelementptr inbounds %1, %1* null, i64 0, i32 0, i32 4, i64 0, i32 3
+  store i8* undef, i8** %tmp9, align 8
+  store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg, i64 0, i64 %tmp11
+  %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+  %tmp14 = add nsw i32 %tmp13, %tmp10
+  store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+  %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+  %tmp18 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+  %tmp19 = atomicrmw volatile add i32* %tmp18, i32 0 seq_cst
+  unreachable
+}
+
+declare i64 @__ockl_get_local_size() local_unnamed_addr
+declare i32 @llvm.amdgcn.workgroup.id.x()
Index: test/CodeGen/AMDGPU/scc-missing-add.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-missing-add.mir
@@ -0,0 +1,159 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# This test presents a sequence of DS_READ instructions that could be combined
+# into a single DS_READ provided all the dependent instructions are correctly
+# identified and moved. In this situation an S_ADDC depends on an S_ADD;
+# however, the S_ADD is more than 10 instructions away and will not be found.
+# The SILoadStoreOptimizer pass needs to detect that the S_ADD was not found
+# and abandon the transformation.
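+#
+# For context, the SCC dependency in question is the carry chain of a 64-bit
+# add, sketched below (the register names are illustrative, not from this test):
+#   s_add_u32  s0, s4, s6    ; defines SCC (carry out of the low half)
+#   s_addc_u32 s1, s5, s7    ; reads SCC (carry into the high half)
+# Moving the S_ADDC without also moving its defining S_ADD corrupts the carry.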
+
+# GFX9-LABEL: name: __omp_offloading_802_d9e513_main_l28
+# GFX9: DS_READ
+# GFX9: DS_WRITE
+# GFX9: S_ADD
+# GFX9: S_ADDC
+# GFX9: GLOBAL_LOAD_DWORD
+# GFX9: GLOBAL_STORE_DWORD
+# GFX9: DS_READ
+
+--- |
+
+  %0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+  %1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+  %2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+  %3 = type { %4, %5, %3* }
+  %4 = type { i64, i64, i64, i64, i32 }
+  %5 = type { i8, i8, i16, i16, i16, i16, i64 }
+  %6 = type { %3 }
+  %7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+  %8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+  %9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+  %10 = type { [1024 x i16] }
+  %11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+  @omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+  @usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+  @execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+  @omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+  define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr #0 {
+  bb:
+    %tmp = tail call i64 @__ockl_get_local_size()
+    br i1 undef, label %bb2, label %bb3, !amdgpu.uniform !0
+
+  bb2:                                              ; preds = %bb
+    ret void
+
+  bb3:                                              ; preds = %bb
+    %__omp_offloading_802_d9e513_main_l28.kernarg.segment = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+    %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %__omp_offloading_802_d9e513_main_l28.kernarg.segment, i64 36
+    %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to [992 x i32] addrspace(1)* addrspace(4)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %arg.load = load [992 x i32] addrspace(1)*, [992 x i32] addrspace(1)* addrspace(4)* %arg.kernarg.offset.cast, align 4, !invariant.load !0
+    %tmp1 = trunc i64 %tmp to i32
+    %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+    %tmp5 = and i32 %tmp4, 1
+    %tmp6 = icmp eq i32 %tmp5, 0
+    %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+    %tmp8 = trunc i32 %tmp7 to i16
+    store i16 %tmp8, i16* undef, align 2
+    store i8* undef, i8** inttoptr (i64 184 to i8**), align 8
+    store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+    %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+    %tmp11 = sext i32 %tmp10 to i64
+    %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg.load, i64 0, i64 %tmp11, !amdgpu.uniform !0
+    %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+    %tmp14 = add nsw i32 %tmp13, %tmp10
+    store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+    %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+    %tmp16 = sext i32 %tmp15 to i64
+    %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+    %0 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+    %tmp19 = atomicrmw volatile add i32* %0, i32 0 seq_cst
+    unreachable
+  }
+
+  declare i64 @__ockl_get_local_size() local_unnamed_addr
+  declare i32 @llvm.amdgcn.workgroup.id.x()
+  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+
+  !0 = !{}
+
+...
+---
+name: __omp_offloading_802_d9e513_main_l28
+body: |
+  bb.0.bb:
+    successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
+    liveins: $sgpr0_sgpr1, $sgpr2
+
+    %3:sreg_32_xm0 = COPY $sgpr2
+    %2:sgpr_64 = COPY $sgpr0_sgpr1
+    ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    %5:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @__ockl_get_local_size + 4, target-flags(amdgpu-gotprel32-hi) @__ockl_get_local_size + 4, implicit-def dead $scc
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %5, 0, 0 :: (dereferenceable invariant load 8 from got, addrspace 4)
+    %7:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %8:sreg_32_xm0 = COPY $sgpr101
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %7
+    $sgpr4 = COPY %8
+    $sgpr30_sgpr31 = SI_CALL killed %6, @__ockl_get_local_size, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit-def $vgpr0_vgpr1
+    ADJCALLSTACKDOWN 0, 4, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    %53:vreg_64 = COPY $vgpr0_vgpr1
+    S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+    S_BRANCH %bb.1
+
+  bb.1.bb2:
+    S_ENDPGM 0
+
+  bb.2.bb3:
+    %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2, 36, 0 :: (dereferenceable invariant load 8 from %ir.arg.kernarg.offset.cast, align 4, addrspace 4)
+    %12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %13:vgpr_32 = DS_READ_B32_gfx9 %12, 184, 0, implicit $exec :: (dereferenceable load 4 from @execution_param, addrspace 3)
+    %55:vgpr_32 = V_BFE_I32 %13, 0, 1, implicit $exec
+    %16:vgpr_32 = V_AND_B32_e32 killed %55, %53.sub0, implicit $exec
+    %18:sreg_64 = IMPLICIT_DEF
+    %19:vreg_64 = COPY %18
+    FLAT_STORE_SHORT killed %19, killed %16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2 into `i16* undef`)
+    %20:sreg_32_xm0 = S_GETREG_B32 31759
+    %21:sreg_32_xm0 = S_LSHL_B32 killed %20, 16, implicit-def dead $scc
+    %56:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+    %57:vgpr_32 = COPY killed %21
+    %24:vreg_64 = REG_SEQUENCE killed %56, %subreg.sub0, killed %57, %subreg.sub1
+    DS_WRITE_B64_gfx9 %12, killed %24, 168, 0, implicit $exec :: (store 8 into `i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1)`, addrspace 3)
+    %25:sreg_32_xm0 = S_ASHR_I32 %3, 31, implicit-def dead $scc
+    %27:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %25, %subreg.sub1
+    %29:sreg_64 = S_LSHL_B64 killed %27, 2, implicit-def dead $scc
+    %69:sreg_32_xm0 = S_ADD_U32 %10.sub0, %29.sub0, implicit-def $scc
+    %150:vgpr_32 = COPY killed %21
+    %151:vgpr_32 = COPY killed %21
+    %152:vgpr_32 = COPY killed %21
+    %153:vgpr_32 = COPY killed %21
+    %154:vgpr_32 = COPY killed %21
+    %155:vgpr_32 = COPY killed %21
+    %156:vgpr_32 = COPY killed %21
+    %157:vgpr_32 = COPY killed %21
+    %158:vgpr_32 = COPY killed %21
+    %159:vgpr_32 = COPY killed %21
+    %160:vgpr_32 = COPY killed %21
+    %70:sreg_32_xm0 = S_ADDC_U32 %10.sub1, %29.sub1, implicit-def $scc, implicit $scc
+    %30:sreg_64 = REG_SEQUENCE %69, %subreg.sub0, %70, %subreg.sub1
+    %130:sreg_64 = REG_SEQUENCE %160, %157
+    %131:sreg_64 = REG_SEQUENCE %158, %159
+    %32:vreg_64 = COPY %30
+    %31:vgpr_32 = GLOBAL_LOAD_DWORD %32, 0, 0, 0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 1)
+    %58:vgpr_32 = nsw V_ADD_U32_e64 %31, %3, 0, implicit $exec
+    GLOBAL_STORE_DWORD %32, %58, 0, 0, 0, implicit $exec :: (store 4 into %ir.tmp12, addrspace 1)
+    %37:vgpr_32 = DS_READ_B32_gfx9 %12, 0, 0, implicit $exec :: (dereferenceable load 4 from @usedSlotIdx, addrspace 3)
+    %38:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @omptarget_nvptx_device_State + 4, target-flags(amdgpu-gotprel32-hi) @omptarget_nvptx_device_State + 4, implicit-def dead $scc
+    %39:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %38, 0, 0 :: (dereferenceable invariant load 8 from got, addrspace 4)
+    %40:sreg_32_xm0 = S_MOV_B32 37501328
+    %43:vreg_64 = COPY killed %39
+    %41:vreg_64, %42:sreg_64 = V_MAD_I64_I32 killed %37, killed %40, %43, 0, implicit $exec
+    %65:sgpr_32 = S_MOV_B32 37501188
+    %60:vgpr_32 = V_ADD_I32_e32 %65, %41.sub0, implicit-def $vcc, implicit $exec
+    %62:sreg_64_xexec = COPY killed $vcc
+    %61:vgpr_32, dead %63:sreg_64_xexec = V_ADDC_U32_e64 %41.sub1, 0, killed %62, 0, implicit $exec
+    %59:vreg_64 = REG_SEQUENCE %60, %subreg.sub0, %61, %subreg.sub1
+    %52:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    FLAT_ATOMIC_ADD %59, %52, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst 4 on %ir.0)
+...