Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -263,6 +263,20 @@
   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
 }
 
+// If MI reads SCC, return the closest preceding instruction in the same
+// basic block that defines SCC; return nullptr if MI does not read SCC.
+static MachineInstr *findSCCDefInstr(MachineInstr &MI) {
+  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::SCC))
+    return nullptr;
+
+  MachineBasicBlock::reverse_iterator I = MI, E = MI.getParent()->rend();
+  for (++I; I != E; ++I)
+    if (I->definesRegister(AMDGPU::SCC))
+      return &*I;
+  assert(0 && "Expected to find an SCC def before the SCC use");
+  return nullptr;
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
@@ -281,6 +295,10 @@
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
           PhysRegUses.count(Use.getReg())))) {
+      // If this MI reads SCC, find the instruction that defines SCC and add
+      // it as well, so the SCC dependency is not broken.
+      if (MachineInstr *SCCDef = findSCCDefInstr(MI))
+        Insts.push_back(SCCDef);
       Insts.push_back(&MI);
       addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
Index: test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
@@ -0,0 +1,66 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=CHECK %s
+
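+; The SCC-clobbering s_lshl_b32 must not be placed between an s_add_u32 and
+; the s_addc_u32 that consumes its carry in SCC.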
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK-NOT: s_lshl_b32
+; CHECK: s_addc_u32
+; CHECK: global_load_dword
+
+%0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+%1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+%2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+%3 = type { %4, %5, %3* }
+%4 = type { i64, i64, i64, i64, i32 }
+%5 = type { i8, i8, i16, i16, i16, i16, i64 }
+%6 = type { %3 }
+%7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+%8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+%9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+%10 = type { [1024 x i16] }
+%11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+@omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+@usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr  {
+bb:
+  %tmp = tail call i64 @__ockl_get_local_size()
+  %tmp1 = trunc i64 %tmp to i32
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  ret void
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+  %tmp5 = and i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+  %tmp8 = trunc i32 %tmp7 to i16
+  store i16 %tmp8, i16* undef, align 2
+  %tmp9 = getelementptr inbounds %1, %1* null, i64 0, i32 0, i32 4, i64 0, i32 3
+  store i8* undef, i8** %tmp9, align 8
+  store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg, i64 0, i64 %tmp11
+  %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+  %tmp14 = add nsw i32 %tmp13, %tmp10
+  store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+  %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+  %tmp18 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+  %tmp19 = atomicrmw volatile add i32* %tmp18, i32 0 seq_cst
+  unreachable
+}
+
+declare i64 @__ockl_get_local_size() local_unnamed_addr
+declare i32 @llvm.amdgcn.workgroup.id.x()