diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1123,36 +1123,26 @@ for (unsigned SUNum : DAG->TopDownIndex2SU) { const SUnit &SU = DAG->SUnits[SUNum]; if (SIInstrInfo::isEXP(*SU.getInstr())) { - // Check the EXP can be added to the group safely, - // ie without needing any other instruction. - // The EXP is allowed to depend on other EXP - // (they will be in the same group). - for (unsigned j : ExpGroup) { - bool HasSubGraph; - std::vector SubGraph; - // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary - // in the parent graph of SU. -#ifndef NDEBUG - SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], - HasSubGraph); - assert(!HasSubGraph); -#endif - SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, - HasSubGraph); - if (!HasSubGraph) - continue; // No dependencies between each other - - // SubGraph contains all the instructions required - // between EXP SUnits[j] and EXP SU. - for (unsigned k : SubGraph) { - if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) - // Other instructions than EXP would be required in the group. - // Abort the grouping. - return; + // SU is an export instruction. Check whether one of its successor + // dependencies is a non-export, in which case we skip export grouping. + for (const SDep &SuccDep : SU.Succs) { + const SUnit *SuccSU = SuccDep.getSUnit(); + if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) { + // Skip weak edges and successors not indexed by DAG->SUnits. + continue; + } + assert(SuccSU->isInstr() && + "SUnit unexpectedly not representing an instruction!"); + + if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) { + // A non-export depends on us. Skip export grouping. 
+ // Note that this is a bit pessimistic: We could still group all other + // exports that are not depended on by non-exports, directly or + // indirectly. Simply skipping this particular export but grouping all + // others would not account for indirect dependencies. + return; } } - ExpGroup.push_back(SUNum); } } diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 --misched=si -mattr=si-scheduler < %s | FileCheck %s + +define amdgpu_gs void @_amdgpu_gs_main() { +; CHECK-LABEL: _amdgpu_gs_main: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: s_mov_b32 s3, s0 +; CHECK-NEXT: exp mrt0 off, off, off, off +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, i1 false, i1 false) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg)