diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -106,6 +106,7 @@ bool fixVALUTransUseHazard(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); + bool fixVALUMaskWriteHazard(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1102,6 +1102,7 @@ fixVALUTransUseHazard(MI); fixWMMAHazards(MI); fixShift64HighRegBug(MI); + fixVALUMaskWriteHazard(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -2709,3 +2710,140 @@ return false; } + +bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { + if (!ST.isWave64()) + return false; + if (!ST.hasVALUMaskWriteHazard()) + return false; + if (!SIInstrInfo::isSALU(*MI)) + return false; + + // The hazard sequence is three instructions: + // 1. VALU reads SGPR as mask + // 2. SALU writes SGPR + // 3. SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient. + // In practice this happens <10% of the time, hence this always assumes + // the hazard exists if 1 and 2 are present to avoid searching. 
+ + const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); + if (!SDSTOp || !SDSTOp->isReg()) + return false; + + const Register HazardReg = SDSTOp->getReg(); + if (HazardReg == AMDGPU::EXEC || + HazardReg == AMDGPU::EXEC_LO || + HazardReg == AMDGPU::EXEC_HI || + HazardReg == AMDGPU::M0) + return false; + + auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::V_ADDC_U32_e32: + case AMDGPU::V_ADDC_U32_dpp: + case AMDGPU::V_CNDMASK_B16_e32: + case AMDGPU::V_CNDMASK_B16_dpp: + case AMDGPU::V_CNDMASK_B32_e32: + case AMDGPU::V_CNDMASK_B32_dpp: + case AMDGPU::V_DIV_FMAS_F32_e64: + case AMDGPU::V_DIV_FMAS_F64_e64: + case AMDGPU::V_SUBB_U32_e32: + case AMDGPU::V_SUBB_U32_dpp: + case AMDGPU::V_SUBBREV_U32_e32: + case AMDGPU::V_SUBBREV_U32_dpp: + // These implicitly read VCC as mask source. + return HazardReg == AMDGPU::VCC || + HazardReg == AMDGPU::VCC_LO || + HazardReg == AMDGPU::VCC_HI; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_ADDC_U32_e64_dpp: + case AMDGPU::V_CNDMASK_B16_e64: + case AMDGPU::V_CNDMASK_B16_e64_dpp: + case AMDGPU::V_CNDMASK_B32_e64: + case AMDGPU::V_CNDMASK_B32_e64_dpp: + case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_SUBB_U32_e64_dpp: + case AMDGPU::V_SUBBREV_U32_e64: + case AMDGPU::V_SUBBREV_U32_e64_dpp: { + // Only check mask register overlaps. + const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); + assert(SSRCOp); + return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + } + default: + return false; + } + }; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { + // s_waitcnt_depctr sa_sdst(0) mitigates hazard. + if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + !(I.getOperand(0).getImm() & 0x1)) + return true; + + // VALU access to any SGPR or literal constant other than HazardReg + // mitigates hazard. No need to check HazardReg here as this will + // only be called when !IsHazardFn. 
+ if (!SIInstrInfo::isVALU(I)) + return false; + for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { + const MachineOperand &Op = I.getOperand(OpNo); + if (Op.isReg()) { + Register OpReg = Op.getReg(); + // Only consider uses + if (!Op.isUse()) + continue; + // Ignore EXEC + if (OpReg == AMDGPU::EXEC || + OpReg == AMDGPU::EXEC_LO || + OpReg == AMDGPU::EXEC_HI) + continue; + // Ignore all implicit uses except VCC + if (Op.isImplicit()) { + if (OpReg == AMDGPU::VCC || + OpReg == AMDGPU::VCC_LO || + OpReg == AMDGPU::VCC_HI) + return true; + continue; + } + if (TRI.isSGPRReg(MRI, OpReg)) + return true; + } else { + const MCInstrDesc &InstDesc = I.getDesc(); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; + if (TII.isLiteralConstant(Op, OpInfo)) + return true; + } + } + return false; + }; + + // Check for hazard + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + auto NextMI = std::next(MI->getIterator()); + + // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xfffe); + + // SALU write may be s_getpc in a bundle. + if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { + // Update offsets of any references in the bundle. + while (NextMI != MI->getParent()->end() && + NextMI->isBundledWithPred()) { + for (auto &Operand : NextMI->operands()) { + if (Operand.isGlobal()) + Operand.setOffset(Operand.getOffset() + 4); + } + NextMI++; + } + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1058,6 +1058,8 @@ bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; } + bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } + + /// Return if operations acting on VGPR tuples require even alignment. 
bool needsAlignedVGPRs() const { return GFX90AInsts; } diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -0,0 +1,560 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s + +--- | + @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>] + + define amdgpu_gs void @mask_hazard_getpc1() { ret void } + define amdgpu_gs void @mask_hazard_getpc2() { ret void } + define amdgpu_gs void @mask_hazard_vcc1() { ret void } + define amdgpu_gs void @mask_hazard_vcc2() { ret void } + define amdgpu_gs void @mask_hazard_cndmask_dpp1() { ret void } + define amdgpu_gs void @mask_hazard_cndmask_dpp2() { ret void } + define amdgpu_gs void @mask_hazard_cndmask_dpp3() { ret void } + define amdgpu_gs void @mask_hazard_cndmask_dpp4() { ret void } + define amdgpu_gs void @mask_hazard_addc1() { ret void } + define amdgpu_gs void @mask_hazard_addc2() { ret void } + define amdgpu_gs void @mask_hazard_addc3() { ret void } + define amdgpu_gs void @mask_hazard_addc4() { ret void } + define amdgpu_gs void @mask_hazard_subb1() { ret void } + define amdgpu_gs void @mask_hazard_subb2() { ret void } + define amdgpu_gs void @mask_hazard_subb3() { ret void } + define amdgpu_gs void @mask_hazard_subb4() { ret void } + define amdgpu_gs void @mask_hazard_subbrev1() { ret void } + define amdgpu_gs void @mask_hazard_subbrev2() { ret void } + define amdgpu_gs void @mask_hazard_subbrev3() { ret void } + define amdgpu_gs void @mask_hazard_subbrev4() { ret void } + define amdgpu_gs void @mask_hazard_div_fmas_f32() { ret void } + define amdgpu_gs void @mask_hazard_div_fmas_f64() { ret void } + define amdgpu_gs 
void @mask_hazard_subreg1() { ret void } + define amdgpu_gs void @mask_hazard_subreg2() { ret void } + define amdgpu_gs void @mask_hazard_subreg3() { ret void } + define amdgpu_gs void @mask_hazard_subreg4() { ret void } + define amdgpu_gs void @mask_hazard_subreg5() { ret void } + define amdgpu_gs void @mask_hazard_waitcnt() { ret void } + define amdgpu_gs void @mask_hazard_gap1() { ret void } + define amdgpu_gs void @mask_hazard_gap2() { ret void } + define amdgpu_gs void @mask_hazard_gap3() { ret void } + define amdgpu_gs void @mask_hazard_no_hazard1() { ret void } + define amdgpu_gs void @mask_hazard_no_hazard2() { ret void } + define amdgpu_gs void @mask_hazard_no_hazard3() { ret void } +... + +--- +name: mask_hazard_getpc1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_getpc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_getpc2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_getpc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc + ; GCN-NEXT: } + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + BUNDLE implicit-def $sgpr0_sgpr1 { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc + $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc + } + S_ENDPGM 0 +... + +--- +name: mask_hazard_vcc1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_vcc1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_vcc2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_vcc2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_cndmask_dpp1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cndmask_dpp1 + ; GCN: $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_cndmask_dpp2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cndmask_dpp2 + ; GCN: $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_cndmask_dpp3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cndmask_dpp3 + ; GCN: $vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_CNDMASK_B16_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_cndmask_dpp4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_cndmask_dpp4 + ; GCN: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_addc1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_addc1 + ; GCN: $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_addc2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_addc2 + ; GCN: $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_addc3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_addc3 + ; GCN: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_addc4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_addc4 + ; GCN: $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subb1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subb1 + ; GCN: $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subb2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subb2 + ; GCN: $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subb3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subb3 + ; GCN: $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_subb4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subb4 + ; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subbrev1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subbrev1 + ; GCN: $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subbrev2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subbrev2 + ; GCN: $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_subbrev3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subbrev3 + ; GCN: $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +--- +name: mask_hazard_subbrev4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subbrev4 + ; GCN: $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec + $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_div_fmas_f32 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_div_fmas_f32 + ; GCN: $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... + +--- +name: mask_hazard_div_fmas_f64 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_div_fmas_f64 + ; GCN: $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec + $vcc = S_CSELECT_B64 -1, 0, implicit $scc + S_ENDPGM 0 +... 
+ +# Check low word overlap +--- +name: mask_hazard_subreg1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subreg1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2 = S_MOV_B32 0 + S_ENDPGM 0 +... + +# Check high word overlap +--- +name: mask_hazard_subreg2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subreg2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr3 = S_MOV_B32 0 + S_ENDPGM 0 +... + +# Check multiple subreg overlap +--- +name: mask_hazard_subreg3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subreg3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr2 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 0 + S_ENDPGM 0 +... + +# Check vcc_lo overlap +--- +name: mask_hazard_subreg4 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subreg4 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_lo = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_lo = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 $vcc_lo + S_ENDPGM 0 +... 
+ +# Check vcc_hi overlap +--- +name: mask_hazard_subreg5 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_subreg5 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vcc_hi = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vcc_hi = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 $vcc_hi + S_ENDPGM 0 +... + +# S_WAITCNT does not mitigate hazard +--- +name: mask_hazard_waitcnt +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_waitcnt + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + S_WAITCNT 0 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +# Check implicit $exec +--- +name: mask_hazard_gap1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_gap1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... 
+ +# Check implicit $mode +--- +name: mask_hazard_gap2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_gap2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +# Check explicit $exec +--- +name: mask_hazard_gap3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_gap3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec + $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + S_ENDPGM 0 +... + +# Different SGPR write +--- +name: mask_hazard_no_hazard1 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_no_hazard1 + ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec + $sgpr0 = S_MOV_B32 0 + S_ENDPGM 0 +... 
+ +# Different SGPR write with mask read overlap +--- +name: mask_hazard_no_hazard2 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_no_hazard2 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $vcc + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $sgpr0_sgpr1 = S_MOV_B64 $vcc + S_ENDPGM 0 +... + +# Overlapping VGPR write +--- +name: mask_hazard_no_hazard3 +body: | + bb.0: + ; GCN-LABEL: name: mask_hazard_no_hazard3 + ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + S_ENDPGM 0 +...