Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -901,6 +901,7 @@ const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU()); const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName); if (!SDST) { for (const auto &MO : MI->implicit_operands()) { @@ -919,22 +920,37 @@ return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); }; - // This assumes that there will be s_waitcnt lgkmcnt(0) or equivalent - // between any at risk SMEM and any SALU dependent on the SMEM results. - auto IsExpiredFn = [TII] (MachineInstr *MI, int) { + auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) { if (MI) { if (TII->isSALU(*MI)) { - if (TII->isSOPP(*MI)) - return false; switch (MI->getOpcode()) { case AMDGPU::S_SETVSKIP: case AMDGPU::S_VERSION: case AMDGPU::S_WAITCNT_VSCNT: case AMDGPU::S_WAITCNT_VMCNT: case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: + // These instructions cannot mitigate the hazard. return false; + case AMDGPU::S_WAITCNT_LGKMCNT: + // Reducing lgkmcnt count to 0 always mitigates the hazard. + return (MI->getOperand(1).getImm() == 0) && + (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + case AMDGPU::S_WAITCNT: { + const int64_t Imm = MI->getOperand(0).getImm(); + AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); + return (Decoded.LgkmCnt == 0); + } default: + // SOPP instructions cannot mitigate the hazard. + if (TII->isSOPP(*MI)) + return false; + // At this point the SALU can be assumed to mitigate the hazard + // because either: + // (a) it is independent of the at risk SMEM (breaking chain), + // or + // (b) it is dependent on the SMEM, in which case an appropriate + // s_waitcnt lgkmcnt _must_ exist between it and the at risk + // SMEM instruction. 
return true; } } Index: test/CodeGen/AMDGPU/smem-war-hazard.mir =================================================================== --- test/CodeGen/AMDGPU/smem-war-hazard.mir +++ test/CodeGen/AMDGPU/smem-war-hazard.mir @@ -29,13 +29,13 @@ S_ENDPGM 0 ... -# GCN-LABEL: name: hazard_smem_war_related_clause +# GCN-LABEL: name: hazard_smem_war_dependent_salu # GCN: S_LOAD_DWORD_IMM # GCN-NEXT: S_WAITCNT # GCN-NEXT: S_ADD_U32 # GCN-NEXT: V_CMP_EQ_F32 --- -name: hazard_smem_war_related_clause +name: hazard_smem_war_dependent_salu body: | bb.0: liveins: $sgpr0, $sgpr1, $sgpr4, $vgpr0, $vgpr1 @@ -46,19 +46,128 @@ S_ENDPGM 0 ... -# GCN-LABEL: name: hazard_smem_war_related_clause_vmcnt +# GCN-LABEL: name: hazard_smem_war_independent_salu # GCN: S_LOAD_DWORD_IMM -# GCN-NEXT: S_WAITCNT 3952{{$}} +# GCN-NEXT: S_WAITCNT # GCN-NEXT: S_ADD_U32 # GCN-NEXT: V_CMP_EQ_F32 --- -name: hazard_smem_war_related_clause_vmcnt +name: hazard_smem_war_independent_salu body: | bb.0: - liveins: $sgpr0, $sgpr1, $sgpr4, $vgpr0, $vgpr1 + liveins: $sgpr0, $sgpr1, $sgpr4, $sgpr5, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT 0 + $sgpr3 = S_ADD_U32 $sgpr5, $sgpr4, implicit-def $scc + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_smem_war_only_smem +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_LOAD_DWORD_IMM +# GCN-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_smem +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr6, $sgpr7, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + $sgpr5 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0, 0 + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_smem_war_only_waitcnt_0 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_waitcnt_0 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT 0 + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_smem_war_only_vmcnt_0 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT 3952{{$}} +# GCN-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_vmcnt_0 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 S_WAITCNT 3952 - $sgpr3 = S_ADD_U32 $sgpr2, $sgpr4, implicit-def $scc + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_smem_war_only_expcnt_0 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT 53007{{$}} +# GCN-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_expcnt_0 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT 53007 + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_smem_war_only_lgkmcnt_0 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT 49279{{$}} +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_lgkmcnt_0 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT 49279 + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... 
+ +# GCN-LABEL: name: hazard_smem_war_only_waitcnt_lgkmcnt_0 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT_LGKMCNT +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_waitcnt_lgkmcnt_0 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT_LGKMCNT $sgpr_null, 0 + $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec + S_ENDPGM 0 +... + +# GCN-LABEL: name: hazard_smem_war_only_waitcnt_lgkmcnt_1 +# GCN: S_LOAD_DWORD_IMM +# GCN-NEXT: S_WAITCNT_LGKMCNT +# GCN-NEXT: $sgpr_null = S_MOV_B32 0 +# GCN-NEXT: V_CMP_EQ_F32 +--- +name: hazard_smem_war_only_waitcnt_lgkmcnt_1 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 + S_WAITCNT_LGKMCNT $sgpr_null, 1 $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec S_ENDPGM 0 ...