Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -296,9 +296,11 @@
   assert(MemOp.mayLoadOrStore());
 
   for (MachineInstr *InstToMove : InstsToMove) {
-    if (!InstToMove->mayLoadOrStore())
-      continue;
-    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+    if (InstToMove->mayLoadOrStore() &&
+        !memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+      return false;
+
+    if (InstToMove->registerDefIsDead(AMDGPU::SCC))
       return false;
   }
   return true;
Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -59,6 +59,48 @@
   attributes #0 = { convergent nounwind }
   attributes #1 = { convergent nounwind readnone }
 
+  define amdgpu_kernel void @can_move_with_scc([0 x i8] addrspace(6)* %arg) #0 {
+  bb:
+    %main.kernarg.segment = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+    %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %main.kernarg.segment, i64 36
+    %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to [0 x i8] addrspace(6)* addrspace(4)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %arg.load = load [0 x i8] addrspace(6)*, [0 x i8] addrspace(6)* addrspace(4)* %arg.kernarg.offset.cast, align 4, !invariant.load !0
+    %tmp15 = bitcast [0 x i8] addrspace(6)* %arg.load to i8 addrspace(6)*
+    %tmp1 = bitcast i8 addrspace(6)* %tmp15 to <4 x i32> addrspace(6)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %tmp2 = load <4 x i32>, <4 x i32> addrspace(6)* %tmp1, align 16, !invariant.load !0
+    %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 0, i32 0) #1
+    %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 4, i32 0) #1
+    %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 8, i32 0) #1
+    %tmp6 = bitcast float %tmp3 to i32
+    %tmp7 = icmp ne i32 %tmp6, 0
+    %tmp8 = bitcast float %tmp4 to i32
+    %tmp9 = icmp ne i32 %tmp8, 0
+    %tmp10 = bitcast float %tmp5 to i32
+    %tmp11 = icmp ne i32 %tmp10, 0
+    %tmp12 = and i1 %tmp9, %tmp11
+    %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 12, i32 0) #1
+    %tmp14 = select i1 %tmp12, float %tmp13, float 0.000000e+00
+    br i1 %tmp7, label %bb15, label %bb16, !amdgpu.uniform !0, !structurizecfg.uniform !0
+
+  bb15: ; preds = %bb
+    br label %bb16, !amdgpu.uniform !0, !structurizecfg.uniform !0
+
+  bb16: ; preds = %bb15, %bb
+    %tmp17 = phi float [ 0.000000e+00, %bb15 ], [ %tmp14, %bb ]
+    %tmp18 = fmul float %tmp14, %tmp17
+    %tmp19 = bitcast float %tmp18 to i32
+    store volatile i32 %tmp19, i32 addrspace(1)* undef
+    ret void
+  }
+
+  declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1
+  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
+
+  attributes #0 = { convergent nounwind }
+  attributes #1 = { convergent nounwind readnone }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
 ...
 ---
 name: mem_dependency
@@ -129,3 +171,51 @@
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
 
 ...
+---
+# Make sure SCC is still valid if instructions are moved across memory
+# operations.
+# CHECK-LABEL: name: can_move_with_scc
+# CHECK: S_AND_B64
+# CHECK: S_CMP_EQ_U32
+name: can_move_with_scc
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+body: |
+  bb.0.bb:
+    liveins: $sgpr0_sgpr1
+
+    %3:sgpr_64 = COPY $sgpr0_sgpr1
+    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %3, 9, 0 :: (dereferenceable invariant load 4 from %ir.arg.kernarg.offset.cast, addrspace 4)
+    %7:sreg_32_xm0 = S_MOV_B32 0
+    %8:sreg_64_xexec = REG_SEQUENCE killed %6, %subreg.sub0, %7, %subreg.sub1
+    %9:sreg_128 = S_LOAD_DWORDX4_IMM killed %8, 0, 0 :: (invariant load 16 from %ir.tmp1, addrspace 6)
+    %10:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 0, 0 :: (dereferenceable invariant load 4)
+    %11:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 1, 0 :: (dereferenceable invariant load 4)
+    %12:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 2, 0 :: (dereferenceable invariant load 4)
+    %13:sreg_64 = V_CMP_NE_U32_e64 killed %11, 0, implicit $exec
+    %15:sreg_64 = V_CMP_NE_U32_e64 killed %12, 0, implicit $exec
+    %17:sreg_64_xexec = S_AND_B64 killed %13, killed %15, implicit-def dead $scc
+    S_CMP_EQ_U32 killed %10, 0, implicit-def $scc
+    %18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0 :: (dereferenceable invariant load 4)
+    %21:vgpr_32 = COPY killed %18
+    %0:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %21, killed %17, implicit $exec
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1.bb15:
+    %22:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.2.bb16:
+    %1:vgpr_32 = PHI %0, %bb.0, %22, %bb.1
+    %23:vgpr_32 = V_MUL_F32_e64 0, %0, 0, %1, 0, 0, implicit $exec
+    %24:sreg_64 = IMPLICIT_DEF
+    %25:sreg_32_xm0 = COPY %24.sub1
+    %26:sreg_64 = IMPLICIT_DEF
+    %27:sreg_32_xm0 = COPY %26.sub0
+    %28:sreg_32_xm0 = S_MOV_B32 61440
+    %29:sreg_32_xm0 = S_MOV_B32 -1
+    %30:sreg_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %25, %subreg.sub1, killed %29, %subreg.sub2, killed %28, %subreg.sub3
+    BUFFER_STORE_DWORD_OFFSET killed %23, killed %30, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+...
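
For reference, here is how the loop touched by the first hunk reads once the patch is applied. This is a sketch, not part of the patch itself: the enclosing helper is assumed to be the static canMoveInstsAcrossMemOp function in SILoadStoreOptimizer.cpp (its name and signature are not visible in the hunk context), and the comments spell out the SCC reasoning the new check relies on.

  // Sketch only: the function name and signature are assumed from the
  // surrounding file, not quoted from this patch; it relies on the includes
  // already present in SILoadStoreOptimizer.cpp.
  static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                      ArrayRef<MachineInstr *> InstsToMove,
                                      AliasAnalysis *AA) {
    assert(MemOp.mayLoadOrStore());

    for (MachineInstr *InstToMove : InstsToMove) {
      // A memory access may only be moved across MemOp if the two accesses
      // are provably reorderable (e.g. they cannot alias).
      if (InstToMove->mayLoadOrStore() &&
          !memAccessesCanBeReordered(MemOp, *InstToMove, AA))
        return false;

      // Even an SCC def that is marked dead still clobbers SCC at the new
      // position. If SCC is live there (e.g. between the S_CMP_EQ_U32 and
      // S_CBRANCH_SCC1 in the can_move_with_scc test above), moving the
      // instruction would corrupt the branch condition, so refuse to move it.
      if (InstToMove->registerDefIsDead(AMDGPU::SCC))
        return false;
    }
    return true;
  }

The SCC check is deliberately conservative: rather than recomputing SCC liveness at the insertion point, it refuses to move any instruction that carries a dead SCC def.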