diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1428,9 +1428,19 @@
     ScoreBrackets.dump();
   });
 
-  // Assume VCCZ is correct at basic block boundaries, unless and until we need
-  // to handle cases where that is not true.
+  // Track the correctness of vccz through this basic block. There are two
+  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
+  // ST->partialVCCWritesUpdateVCCZ().
   bool VCCZCorrect = true;
+  if (ST->hasReadVCCZBug()) {
+    // vccz could be incorrect at a basic block boundary if a predecessor wrote
+    // to vcc and then issued an smem load.
+    VCCZCorrect = false;
+  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
+    // vccz could be incorrect at a basic block boundary if a predecessor wrote
+    // to vcc_lo or vcc_hi.
+    VCCZCorrect = false;
+  }
 
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
@@ -1451,15 +1461,21 @@
       continue;
     }
 
-    // We might need to restore vccz to its correct value for either of two
-    // different reasons; see ST->hasReadVCCZBug() and
-    // ST->partialVCCWritesUpdateVCCZ().
-    bool RestoreVCCZ = false;
-    if (readsVCCZ(Inst)) {
-      if (!VCCZCorrect) {
-        // Restore vccz if it's not known to be correct already.
-        RestoreVCCZ = true;
-      } else if (ST->hasReadVCCZBug()) {
+    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    OldWaitcntInstr = nullptr;
+
+    // Restore vccz if it's not known to be correct already.
+    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+
+    // Don't examine operands unless we need to track vccz correctness.
+    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
+      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+          Inst.definesRegister(AMDGPU::VCC_HI)) {
+        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+        if (!ST->partialVCCWritesUpdateVCCZ())
+          VCCZCorrect = false;
+      } else if (Inst.definesRegister(AMDGPU::VCC)) {
         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
         // vccz bit, so when we detect that an instruction may read from a
         // corrupt vccz bit, we need to:
@@ -1467,12 +1483,16 @@
         // operations to complete.
         // 2. Restore the correct value of vccz by writing the current value
         // of vcc back to vcc.
-        if (ScoreBrackets.getScoreLB(LGKM_CNT) <
-                ScoreBrackets.getScoreUB(LGKM_CNT) &&
+        if (ST->hasReadVCCZBug() &&
+            ScoreBrackets.getScoreLB(LGKM_CNT) <
+                ScoreBrackets.getScoreUB(LGKM_CNT) &&
             ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-          // Restore vccz if there's an outstanding smem read, which could
-          // complete and clobber vccz at any time.
-          RestoreVCCZ = true;
+          // Writes to vcc while there's an outstanding smem read may get
+          // clobbered as soon as any read completes.
+          VCCZCorrect = false;
+        } else {
+          // Writes to vcc will fix any incorrect value in vccz.
+          VCCZCorrect = true;
         }
       }
     }
@@ -1482,24 +1502,12 @@
         const Value *Ptr = Memop->getValue();
         SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
       }
-    }
-
-    if (!ST->partialVCCWritesUpdateVCCZ()) {
-      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
-          Inst.definesRegister(AMDGPU::VCC_HI)) {
-        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+      if (ST->hasReadVCCZBug()) {
+        // This smem read could complete and clobber vccz at any time.
         VCCZCorrect = false;
-      } else if (Inst.definesRegister(AMDGPU::VCC)) {
-        // Writes to vcc will fix any incorrect value in vccz.
-        VCCZCorrect = true;
       }
     }
 
-    // Generate an s_waitcnt instruction to be placed before
-    // cur_Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
-    OldWaitcntInstr = nullptr;
-
     updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
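
[Reviewer note, not part of the patch] The hunks above turn vccz handling into a small per-block state machine: distrust vccz at block entry on affected subtargets, invalidate it on partial vcc writes and on smem reads (the CI/SI read bug), revalidate it on full vcc writes, and restore it before any read while it is distrusted. Below is a minimal standalone sketch of that state machine, not the pass itself: the Inst struct and both bool parameters are simplified stand-ins for MachineInstr, readsVCCZ(), Inst.definesRegister(), the LGKM_CNT/SMEM_ACCESS score brackets, ST->hasReadVCCZBug() and ST->partialVCCWritesUpdateVCCZ(), and the insertion of the s_waitcnt lgkmcnt(0) / s_mov_b64 vcc, vcc pair is reduced to recording an index.

// Hypothetical model for review purposes only; see the note above.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Inst {
  bool ReadsVCCZ = false;     // e.g. S_CBRANCH_VCCZ / S_CBRANCH_VCCNZ
  bool WritesVCC = false;     // defines the full vcc register
  bool WritesVCCPart = false; // defines only vcc_lo or vcc_hi
  bool IsSMEMRead = false;    // e.g. S_LOAD_DWORD_IMM
};

// Returns the indices of instructions that need the restore sequence
// (s_waitcnt lgkmcnt(0); s_mov_b64 vcc, vcc) inserted before them.
std::vector<std::size_t> findVCCZRestores(const std::vector<Inst> &Block,
                                          bool HasReadVCCZBug,
                                          bool PartialVCCWritesUpdateVCCZ) {
  // As in the first hunk: distrust vccz at the block boundary on any
  // subtarget where a predecessor could have left it stale.
  bool VCCZCorrect = !HasReadVCCZBug && PartialVCCWritesUpdateVCCZ;
  bool SMEMPending = false; // stands in for the LGKM_CNT/SMEM_ACCESS brackets
  std::vector<std::size_t> Restores;

  for (std::size_t I = 0; I != Block.size(); ++I) {
    const Inst &MI = Block[I];
    if (!VCCZCorrect && MI.ReadsVCCZ) {
      Restores.push_back(I);
      VCCZCorrect = true;  // the restore rewrites vcc to vcc...
      SMEMPending = false; // ...after draining smem reads with lgkmcnt(0)
    }
    if (MI.WritesVCCPart) {
      // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
      if (!PartialVCCWritesUpdateVCCZ)
        VCCZCorrect = false;
    } else if (MI.WritesVCC) {
      // A full vcc write fixes vccz, unless an outstanding smem read can
      // still complete and clobber it (the CI/SI hardware bug).
      VCCZCorrect = !(HasReadVCCZBug && SMEMPending);
    }
    if (MI.IsSMEMRead && HasReadVCCZBug) {
      SMEMPending = true;
      VCCZCorrect = false; // this read may complete and clobber vccz
    }
  }
  return Restores;
}

int main() {
  // Write vcc, issue an smem read, then branch on vccz: on SI the branch
  // needs vccz restored first.
  std::vector<Inst> Block(3);
  Block[0].WritesVCC = true;
  Block[1].IsSMEMRead = true;
  Block[2].ReadsVCCZ = true;
  for (std::size_t I :
       findVCCZRestores(Block, /*HasReadVCCZBug=*/true,
                        /*PartialVCCWritesUpdateVCCZ=*/false))
    std::printf("restore vccz before instruction %zu\n", I);
}

Run as-is this prints a single restore before instruction 2, matching the vcc-write / smem-read / vccz-branch shape of the load_def_wait_nop_use test whose FIXME this patch resolves below.
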
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -45,6 +45,7 @@
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
 ; SI-NEXT: s_cbranch_vccnz BB1_2
 ; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
 ; SI-NEXT: s_endpgm
@@ -86,6 +87,7 @@
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
 ; SI-NEXT: s_cbranch_vccnz BB2_2
 ; SI-NEXT: ; %bb.3: ; %Flow
 ; SI-NEXT: s_mov_b64 s[2:3], 0
@@ -103,6 +105,7 @@
 ; SI-NEXT: BB2_6: ; %loop1
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
 ; SI-NEXT: s_cbranch_vccz BB2_6
 ; SI-NEXT: BB2_7: ; %DummyReturnBlock
 ; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -42,6 +42,7 @@
 ; GCN-NEXT: s_and_b64 vcc, exec, 0
 ; GCN-NEXT: BB0_6: ; %bb9
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_mov_b64 vcc, vcc
 ; GCN-NEXT: s_cbranch_vccz BB0_6
 ; GCN-NEXT: BB0_7: ; %DummyReturnBlock
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -227,6 +227,7 @@
 ; SI-NEXT: s_and_b64 vcc, exec, 0
 ; SI-NEXT: BB3_12: ; %self.loop
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_mov_b64 vcc, vcc
 ; SI-NEXT: s_cbranch_vccz BB3_12
 ; SI-NEXT: BB3_13: ; %DummyReturnBlock
 ; SI-NEXT: s_endpgm
@@ -298,6 +299,7 @@
 ; FLAT-NEXT: s_and_b64 vcc, exec, 0
 ; FLAT-NEXT: BB3_12: ; %self.loop
 ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1
+; FLAT-NEXT: s_mov_b64 vcc, vcc
 ; FLAT-NEXT: s_cbranch_vccz BB3_12
 ; FLAT-NEXT: BB3_13: ; %DummyReturnBlock
 ; FLAT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -15,6 +15,7 @@
 ; CHECK-NEXT: s_and_b64 vcc, exec, 0
 ; CHECK-NEXT: BB0_2: ; %bb10
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_mov_b64 vcc, vcc
 ; CHECK-NEXT: s_cbranch_vccz BB0_2
 ; CHECK-NEXT: BB0_3: ; %DummyReturnBlock
 ; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -168,6 +168,8 @@
 
 # CHECK-LABEL: name: vcc_def_pred
 # CHECK: bb.1:
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
 # CHECK: S_CBRANCH_VCCZ %bb.2, implicit $vcc
 
 name: vcc_def_pred
@@ -243,7 +245,7 @@
 # SI-NEXT: $vcc = S_MOV_B64 0
 # SI-NEXT: S_WAITCNT 127
 # SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
 # SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
 name: load_def_wait_nop_use
 body: |
@@ -298,7 +300,7 @@
 # SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
 # SI-NEXT: S_WAITCNT 127
 # SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
 # SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
 name: def_load_wait_nop_use
 body: |
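
[Reviewer note, not part of the patch] The block-entry conservatism added in the first SIInsertWaitcnts.cpp hunk drives both kinds of test changes above: the single-block loops in the .ll tests and vcc_def_pred in the .mir test all define vcc in a predecessor block, so vccz cannot be trusted on entry to the block that branches on it. The tiny table below makes the subtarget split visible; the per-subtarget flag values are inferred from the comments in the patch (the read bug exists on CI/SI only, and partial vcc writes update vccz only after gfx9), not taken from the real GCNSubtarget.

// Hypothetical flag values per subtarget, inferred from the patch comments.
#include <cstdio>

int main() {
  struct Subtarget {
    const char *Name;
    bool HasReadVCCZBug;             // smem reads may corrupt vccz (CI/SI)
    bool PartialVCCWritesUpdateVCCZ; // vcc_lo/vcc_hi writes update vccz
  } Subtargets[] = {
      {"SI/CI", true, false},
      {"GFX9", false, false},
      {"GFX10", false, true},
  };
  for (const Subtarget &ST : Subtargets) {
    // Mirrors the block-entry logic in the first hunk of the patch.
    bool VCCZCorrect = !ST.HasReadVCCZBug && ST.PartialVCCWritesUpdateVCCZ;
    std::printf("%-6s: vccz %s at basic block entry\n", ST.Name,
                VCCZCorrect ? "trusted" : "distrusted (restore before use)");
  }
}

This matches the new checks: SI and GFX9 both require $vcc = S_MOV_B64 $vcc in vcc_def_pred, while a gfx10 run, if the test had one, would need neither workaround.
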