diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -587,6 +587,11 @@
     return getGeneration() <= SEA_ISLANDS;
   }
 
+  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
+  bool partialVCCWritesUpdateVCCZ() const {
+    return getGeneration() >= GFX10;
+  }
+
   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
   /// was written by a VALU instruction.
   bool hasSMRDReadVALUDefHazard() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1383,6 +1383,10 @@
     ScoreBrackets.dump();
   });
 
+  // Assume VCCZ is correct at basic block boundaries, unless and until we need
+  // to handle cases where that is not true.
+  bool VCCZCorrect = true;
+
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
 
@@ -1402,13 +1406,26 @@
       continue;
     }
 
-    bool VCCZBugWorkAround = false;
+    // We might need to restore vccz to its correct value for either of two
+    // different reasons; see ST->hasReadVCCZBug() and
+    // ST->partialVCCWritesUpdateVCCZ().
+    bool RestoreVCCZ = false;
     if (readsVCCZ(Inst)) {
-      if (ScoreBrackets.getScoreLB(LGKM_CNT) <
-              ScoreBrackets.getScoreUB(LGKM_CNT) &&
-          ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-        if (ST->hasReadVCCZBug())
-          VCCZBugWorkAround = true;
+      if (!VCCZCorrect)
+        RestoreVCCZ = true;
+      else if (ST->hasReadVCCZBug()) {
+        // There is a hardware bug on CI/SI where an SMRD instruction may
+        // corrupt the vccz bit, so when we detect that an instruction may
+        // read a corrupted vccz bit, we need to:
+        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+        //    operations to complete.
+        // 2. Restore the correct value of vccz by writing the current value
+        //    of vcc back to vcc.
+        if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+                ScoreBrackets.getScoreUB(LGKM_CNT) &&
+            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+          RestoreVCCZ = true;
+        }
       }
     }
 
@@ -1419,6 +1436,16 @@
       }
     }
 
+    if (!ST->partialVCCWritesUpdateVCCZ()) {
+      // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+      // Writes to vcc will fix it.
+      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+          Inst.definesRegister(AMDGPU::VCC_HI))
+        VCCZCorrect = false;
+      else if (Inst.definesRegister(AMDGPU::VCC))
+        VCCZCorrect = true;
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,7 +1471,7 @@
     // TODO: Remove this work-around after fixing the scheduler and enable the
     // assert above.
 
-    if (VCCZBugWorkAround) {
+    if (RestoreVCCZ) {
       // Restore the vccz bit.  Any time a value is written to vcc, the vcc
       // bit is updated, so we can restore the bit by reading the value of
       // vcc and then writing it back to the register.
@@ -1452,6 +1479,7 @@
               TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
               TRI->getVCC())
           .addReg(TRI->getVCC());
+      VCCZCorrect = true;
       Modified = true;
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -85,3 +85,81 @@
   S_ENDPGM 0
 
 ...
+---
+# Test that after reloading vcc spilled to a vgpr, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_vgpr
+# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_vgpr
+body: |
+  bb.0:
+    $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+    $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after reloading vcc spilled to memory, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_mem
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_mem
+body: |
+  bb.0:
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+    $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+    $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc_lo, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc_lo
+# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc_lo
+body: |
+  bb.0:
+    INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc, no unnecessary instructions are
+# inserted to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc
+# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc
+body: |
+  bb.0:
+    INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  bb.1:
+
+...
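For reference, the transformation these tests expect on pre-gfx10 targets (a
sketch assembled from the reload_vcc_from_vgpr CHECK lines above, not extra
patch content) looks like:

  $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
  $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
  $vcc = S_MOV_B64 $vcc
  S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

The inserted S_MOV_B64 leaves the contents of vcc unchanged, but as a full
write to vcc it refreshes vccz before the branch reads it. On gfx10, where
partialVCCWritesUpdateVCCZ() returns true, the partial writes to vcc_lo and
vcc_hi already keep vccz up to date, so no such move is inserted for this
case.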