diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -582,6 +582,11 @@ return getGeneration() <= SEA_ISLANDS; } + /// True if writes to VCC_LO/VCC_HI update the VCCZ flag (GFX10 and later). + bool partialVCCWritesUpdateVCCZ() const { + return getGeneration() >= GFX10; + } + /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR /// was written by a VALU instruction. bool hasSMRDReadVALUDefHazard() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1367,6 +1367,10 @@ ScoreBrackets.dump(); }); + // Tracks whether the vccz bit is believed to be up to date. Assume it is correct + // at basic block boundaries, unless and until we need to handle cases where it is not. + bool VCCZCorrect = true; + // Walk over the instructions. MachineInstr *OldWaitcntInstr = nullptr; @@ -1386,13 +1390,32 @@ continue; } - bool VCCZBugWorkAround = false; + bool RestoreVCCZ = false; if (readsVCCZ(Inst)) { - if (ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && - ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - if (ST->hasReadVCCZBug()) - VCCZBugWorkAround = true; + if (!VCCZCorrect) + RestoreVCCZ = true; + else if (ST->hasReadVCCZBug()) { + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + RestoreVCCZ = true; + } + } + } + + if (!ST->partialVCCWritesUpdateVCCZ()) { + // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz, so mark it stale. + // A write to the full vcc makes it correct again. Only examine explicit defs. 
+ for (auto &Op : Inst.defs()) { + switch (Op.getReg()) { + case AMDGPU::VCC: + VCCZCorrect = true; + break; + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + VCCZCorrect = false; + break; + } } } @@ -1421,7 +1444,7 @@ // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. - if (VCCZBugWorkAround) { + if (RestoreVCCZ) { // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. @@ -1429,6 +1452,7 @@ TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), TRI->getVCC()) .addReg(TRI->getVCC()); + VCCZCorrect = true; Modified = true; } diff --git a/llvm/test/CodeGen/AMDGPU/reload-vcc-vccz.mir b/llvm/test/CodeGen/AMDGPU/reload-vcc-vccz.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reload-vcc-vccz.mir @@ -0,0 +1,48 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s --check-prefixes=GCN,GFX9 +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s --check-prefixes=GCN + +# Test that after reloading vcc spilled to a vgpr, we insert any necessary +# instructions to fix vccz: a vcc-to-vcc S_MOV_B64 on gfx900, none on gfx1010. + +--- + +name: reload_vcc_from_vgpr +body: | + ; GCN-LABEL: name: reload_vcc_from_vgpr + ; GCN: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc + ; GCN: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9 + ; GFX9: $vcc = S_MOV_B64 $vcc + ; GCN-NOT: $vcc = S_MOV_B64 $vcc + ; GCN: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + bb.0: + $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc + $vcc_hi = V_READLANE_B32_vi $vgpr0, 9 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + bb.1: + +... + +# Test that after reloading vcc spilled to memory, we insert any necessary +# instructions to fix vccz: a vcc-to-vcc S_MOV_B64 on gfx900, none on gfx1010. 
+ +--- + +name: reload_vcc_from_mem +body: | + ; GCN-LABEL: name: reload_vcc_from_mem + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + ; GFX9: $vcc = S_MOV_B64 $vcc + ; GCN-NOT: $vcc = S_MOV_B64 $vcc + ; GCN: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + bb.0: + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + bb.1: + +...