diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -967,6 +967,13 @@ --Lat; } Dep.setLatency(Lat); + } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { + // Work around the fact that SIInstrInfo::fixImplicitOperands modifies + // implicit operands which come from the MCInstrDesc, which can fool + // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit + // pseudo operands. + Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( + DefI, DefOpIdx, UseI, UseOpIdx)); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1134,6 +1134,8 @@ } static unsigned getDSShaderTypeValue(const MachineFunction &MF); + + const TargetSchedModel &getSchedModel() const { return SchedModel; } }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -115,8 +115,8 @@ ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s1 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10_W32-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10_W32-NEXT: s_and_b32 s3, 1, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s0, v0, v1 @@ -178,12 +178,12 @@ ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10_W32-NEXT: v_readfirstlane_b32 s1, v1 @@ -260,8 +260,8 @@ ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] @@ -558,9 +558,9 @@ ; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -634,8 +634,8 @@ ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -83,8 +83,8 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -332,9 +332,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0