Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -296,9 +296,11 @@
   assert(MemOp.mayLoadOrStore());
 
   for (MachineInstr *InstToMove : InstsToMove) {
-    if (!InstToMove->mayLoadOrStore())
-      continue;
-    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+    if (InstToMove->mayLoadOrStore() &&
+        !memAccessesCanBeReordered(MemOp, *InstToMove, AA))
+      return false;
+
+    if (InstToMove->registerDefIsDead(AMDGPU::SCC))
       return false;
   }
   return true;
Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -59,6 +59,48 @@
   attributes #0 = { convergent nounwind }
   attributes #1 = { convergent nounwind readnone }
 
+  define amdgpu_kernel void @can_move_with_scc([0 x i8] addrspace(6)* %arg) #0 {
+  bb:
+    %main.kernarg.segment = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+    %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %main.kernarg.segment, i64 36
+    %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to [0 x i8] addrspace(6)* addrspace(4)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %arg.load = load [0 x i8] addrspace(6)*, [0 x i8] addrspace(6)* addrspace(4)* %arg.kernarg.offset.cast, align 4, !invariant.load !0
+    %tmp15 = bitcast [0 x i8] addrspace(6)* %arg.load to i8 addrspace(6)*
+    %tmp1 = bitcast i8 addrspace(6)* %tmp15 to <4 x i32> addrspace(6)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %tmp2 = load <4 x i32>, <4 x i32> addrspace(6)* %tmp1, align 16, !invariant.load !0
+    %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 0, i32 0) #1
+    %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 4, i32 0) #1
+    %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 8, i32 0) #1
+    %tmp6 = bitcast float %tmp3 to i32
+    %tmp7 = icmp ne i32 %tmp6, 0
+    %tmp8 = bitcast float %tmp4 to i32
+    %tmp9 = icmp ne i32 %tmp8, 0
+    %tmp10 = bitcast float %tmp5 to i32
+    %tmp11 = icmp ne i32 %tmp10, 0
+    %tmp12 = and i1 %tmp9, %tmp11
+    %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 12, i32 0) #1
+    %tmp14 = select i1 %tmp12, float %tmp13, float 0.000000e+00
+    br i1 %tmp7, label %bb15, label %bb16, !amdgpu.uniform !0, !structurizecfg.uniform !0
+
+  bb15: ; preds = %bb
+    br label %bb16, !amdgpu.uniform !0, !structurizecfg.uniform !0
+
+  bb16: ; preds = %bb15, %bb
+    %tmp17 = phi float [ 0.000000e+00, %bb15 ], [ %tmp14, %bb ]
+    %tmp18 = fmul float %tmp14, %tmp17
+    %tmp19 = bitcast float %tmp18 to i32
+    store volatile i32 %tmp19, i32 addrspace(1)* undef
+    ret void
+  }
+
+  declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1
+  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
+
+  attributes #0 = { convergent nounwind }
+  attributes #1 = { convergent nounwind readnone }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
 ...
 ---
 name: mem_dependency
@@ -129,3 +171,51 @@
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
 
 ...
+---
+# Make sure SCC is still valid if instructions are moved across memory
+# operations.
+# CHECK-LABEL: name: can_move_with_scc
+# CHECK: S_AND_B64
+# CHECK: S_CMP_EQ_U32
+name: can_move_with_scc
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+body: |
+  bb.0.bb:
+    liveins: $sgpr0_sgpr1
+
+    %3:sgpr_64 = COPY $sgpr0_sgpr1
+    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %3, 9, 0 :: (dereferenceable invariant load 4 from %ir.arg.kernarg.offset.cast, addrspace 4)
+    %7:sreg_32_xm0 = S_MOV_B32 0
+    %8:sreg_64_xexec = REG_SEQUENCE killed %6, %subreg.sub0, %7, %subreg.sub1
+    %9:sreg_128 = S_LOAD_DWORDX4_IMM killed %8, 0, 0 :: (invariant load 16 from %ir.tmp1, addrspace 6)
+    %10:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 0, 0 :: (dereferenceable invariant load 4)
+    %11:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 1, 0 :: (dereferenceable invariant load 4)
+    %12:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 2, 0 :: (dereferenceable invariant load 4)
+    %13:sreg_64 = V_CMP_NE_U32_e64 killed %11, 0, implicit $exec
+    %15:sreg_64 = V_CMP_NE_U32_e64 killed %12, 0, implicit $exec
+    %17:sreg_64_xexec = S_AND_B64 killed %13, killed %15, implicit-def dead $scc
+    S_CMP_EQ_U32 killed %10, 0, implicit-def $scc
+    %18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0 :: (dereferenceable invariant load 4)
+    %21:vgpr_32 = COPY killed %18
+    %0:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %21, killed %17, implicit $exec
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1.bb15:
+    %22:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.2.bb16:
+    %1:vgpr_32 = PHI %0, %bb.0, %22, %bb.1
+    %23:vgpr_32 = V_MUL_F32_e64 0, %0, 0, %1, 0, 0, implicit $exec
+    %24:sreg_64 = IMPLICIT_DEF
+    %25:sreg_32_xm0 = COPY %24.sub1
+    %26:sreg_64 = IMPLICIT_DEF
+    %27:sreg_32_xm0 = COPY %26.sub0
+    %28:sreg_32_xm0 = S_MOV_B32 61440
+    %29:sreg_32_xm0 = S_MOV_B32 -1
+    %30:sreg_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %25, %subreg.sub1, killed %29, %subreg.sub2, killed %28, %subreg.sub3
+    BUFFER_STORE_DWORD_OFFSET killed %23, killed %30, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    S_ENDPGM 0
+...
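
For reference, here is how the loop touched by the first hunk reads once the patch is applied. This is a sketch, not part of the patch itself: the enclosing helper is assumed to be the static canMoveInstsAcrossMemOp function in SILoadStoreOptimizer.cpp (its name and signature are not visible in the hunk context), and the comments spell out the SCC reasoning the new check relies on.

  // Sketch only: the function name and signature are assumed from the
  // surrounding file, not quoted from this patch; it relies on the includes
  // already present in SILoadStoreOptimizer.cpp.
  static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                      ArrayRef<MachineInstr *> InstsToMove,
                                      AliasAnalysis *AA) {
    assert(MemOp.mayLoadOrStore());

    for (MachineInstr *InstToMove : InstsToMove) {
      // A memory access may only be moved across MemOp if the two accesses
      // are provably reorderable (e.g. they cannot alias).
      if (InstToMove->mayLoadOrStore() &&
          !memAccessesCanBeReordered(MemOp, *InstToMove, AA))
        return false;

      // Even an SCC def that is marked dead still clobbers SCC at the new
      // position. If SCC is live there (e.g. between the S_CMP_EQ_U32 and
      // S_CBRANCH_SCC1 in the can_move_with_scc test above), moving the
      // instruction would corrupt the branch condition, so refuse to move it.
      if (InstToMove->registerDefIsDead(AMDGPU::SCC))
        return false;
    }
    return true;
  }

The SCC check is deliberately conservative: rather than recomputing SCC liveness at the insertion point, it refuses to move any instruction that carries a dead SCC def.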