Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -281,6 +281,7 @@
     // registers are in SSA form.
     if (Use.isReg() &&
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
           PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -59,6 +59,30 @@
   attributes #0 = { convergent nounwind }
   attributes #1 = { convergent nounwind readnone }
 
+  define amdgpu_kernel void @move_waw_hazards([0 x i8] addrspace(6)* %arg) #0 {
+  bb:
+    %tmp15 = bitcast [0 x i8] addrspace(6)* %arg to i8 addrspace(6)*
+    %tmp1 = bitcast i8 addrspace(6)* %tmp15 to <4 x i32> addrspace(6)*
+    %tmp2 = load <4 x i32>, <4 x i32> addrspace(6)* %tmp1, align 16
+    %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 0, i32 0) #1
+    %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 4, i32 0) #1
+    %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 8, i32 0) #1
+    %tmp6 = bitcast float %tmp3 to i32
+    %tmp7 = icmp ne i32 %tmp6, 0
+    %tmp8 = bitcast float %tmp4 to i32
+    %tmp9 = icmp ne i32 %tmp8, 0
+    %tmp10 = bitcast float %tmp5 to i32
+    %tmp11 = icmp ne i32 %tmp10, 0
+    %tmp12 = and i1 %tmp9, %tmp11
+    %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp2, i32 12, i32 0) #1
+    %tmp14 = select i1 %tmp12, float %tmp13, float 0.000000e+00
+    ret void
+  }
+
+  declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1
+
+  attributes #0 = { convergent nounwind }
+  attributes #1 = { convergent nounwind readnone }
 ...
 ---
 name: mem_dependency
@@ -129,3 +153,32 @@
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
 
 ...
+---
+# Make sure Write-after-Write hazards are correctly detected and the
+# instructions moved accordingly.
+# CHECK-LABEL: name: move_waw_hazards
+# CHECK: S_AND_B64
+# CHECK: S_CMP_EQ_U32
+name: move_waw_hazards
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    %3:sgpr_64 = COPY $sgpr0_sgpr1
+    %6:sreg_32_xm0_xexec = S_MOV_B32 0
+    %7:sreg_32_xm0 = S_MOV_B32 0
+    %8:sreg_64_xexec = REG_SEQUENCE killed %6, %subreg.sub0, %7, %subreg.sub1
+    %9:sreg_128 = S_LOAD_DWORDX4_IMM killed %8, 0, 0, 0 :: (invariant load 16 from %ir.tmp1, addrspace 6)
+    %31:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM %9, 0, 0, 0 :: (dereferenceable invariant load 4)
+    %10:sreg_32_xm0_xexec = COPY %31.sub0
+    %11:sreg_32_xm0_xexec = COPY killed %31.sub1
+    %12:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 2, 0, 0 :: (dereferenceable invariant load 4)
+    %13:sreg_64 = V_CMP_NE_U32_e64 killed %11, 0, implicit $exec
+    %15:sreg_64 = V_CMP_NE_U32_e64 killed %12, 0, implicit $exec
+    %17:sreg_64_xexec = S_AND_B64 killed %13, killed %15, implicit-def dead $scc
+    S_CMP_EQ_U32 killed %10, 0, implicit-def $scc
+    %18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0, 0 :: (dereferenceable invariant load 4)
+    S_ENDPGM 0
+...
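
Note on the fix: the dependence check in SILoadStoreOptimizer previously flagged an
intervening instruction only when it read a register defined by an instruction already
queued for moving (read-after-write), or when it defined a physical register that a
queued instruction reads (write-after-read; only physical registers such as M0 or SCC
matter there, since virtual registers are in SSA form). The added clause also flags an
instruction that redefines a register a queued instruction defines (write-after-write),
so that two writes to the same physical register, e.g. the implicit $scc defs of
S_AND_B64 and S_CMP_EQ_U32 in the test above, are never reordered when the two
S_BUFFER_LOAD_DWORD_IMMs are merged.

The following is a minimal standalone sketch of the patched predicate, not the LLVM
API; the Operand/Instr structs and the name dependsOnQueued are invented for
illustration:

    // Sketch: decide whether MI must be moved together with the already
    // queued instructions.  RegDefs holds registers defined by queued
    // instructions; PhysRegUses holds physical registers they read.
    #include <cstdio>
    #include <set>
    #include <vector>

    struct Operand {
      unsigned Reg;
      bool IsDef;      // this instruction writes Reg
      bool ReadsReg;   // this instruction reads Reg
      bool IsPhysical; // physical register (e.g. $scc), not a virtual SSA value
    };
    struct Instr { std::vector<Operand> Ops; };

    bool dependsOnQueued(const Instr &MI, const std::set<unsigned> &RegDefs,
                         const std::set<unsigned> &PhysRegUses) {
      for (const Operand &Use : MI.Ops) {
        if ((Use.ReadsReg && RegDefs.count(Use.Reg)) ||                  // RAW
            (Use.IsDef && RegDefs.count(Use.Reg)) ||                     // WAW (added)
            (Use.IsDef && Use.IsPhysical && PhysRegUses.count(Use.Reg))) // WAR
          return true;
      }
      return false;
    }

    int main() {
      // A queued instruction (think S_AND_B64) defines register 1 ($scc).
      std::set<unsigned> RegDefs = {1};
      std::set<unsigned> PhysRegUses;
      // An S_CMP_EQ_U32-like instruction that also defines $scc: WAW hazard,
      // so it must be moved along with the queued instructions.
      Instr Cmp{{{/*Reg=*/1, /*IsDef=*/true, /*ReadsReg=*/false, /*IsPhysical=*/true}}};
      std::printf("must move: %d\n", (int)dependsOnQueued(Cmp, RegDefs, PhysRegUses));
    }

Without the WAW clause, dependsOnQueued would return false here, the compare could be
left in place while the S_AND moved past it, and the order of the two $scc writes
would be swapped, which is exactly what the CHECK lines in the test guard against.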