diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -81,6 +81,32 @@
   }
 }
 
+static void removeExportDependencies(ScheduleDAGInstrs *DAG, SUnit &SU) {
+  SmallVector<SDep, 2> ToAdd, ToRemove;
+
+  for (const SDep &Pred : SU.Preds) {
+    SUnit *PredSU = Pred.getSUnit();
+    if (Pred.isBarrier() && isExport(*PredSU)) {
+      ToRemove.push_back(Pred);
+      if (isExport(SU))
+        continue;
+
+      // If we remove a barrier we need to copy dependencies
+      // from the predecessor to maintain order.
+      for (const SDep &ExportPred : PredSU->Preds) {
+        SUnit *ExportPredSU = ExportPred.getSUnit();
+        if (ExportPred.isBarrier() && !isExport(*ExportPredSU))
+          ToAdd.push_back(SDep(ExportPredSU, SDep::Barrier));
+      }
+    }
+  }
+
+  for (SDep Pred : ToRemove)
+    SU.removePred(Pred);
+  for (SDep Pred : ToAdd)
+    DAG->addEdge(&SU, Pred);
+}
+
 void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
 
@@ -92,20 +118,18 @@
   // on exports. Edges will be added later to order the exports.
   unsigned PosCount = 0;
   for (SUnit &SU : DAG->SUnits) {
-    if (isExport(SU)) {
-      Chain.push_back(&SU);
-      if (isPositionExport(TII, &SU))
-        PosCount++;
-    }
+    if (!isExport(SU))
+      continue;
 
-    SmallVector<SDep, 2> ToRemove;
-    for (const SDep &Pred : SU.Preds) {
-      SUnit *PredSU = Pred.getSUnit();
-      if (Pred.isBarrier() && isExport(*PredSU))
-        ToRemove.push_back(Pred);
-    }
-    for (SDep Pred : ToRemove)
-      SU.removePred(Pred);
+    Chain.push_back(&SU);
+    if (isPositionExport(TII, &SU))
+      PosCount++;
+
+    removeExportDependencies(DAG, SU);
+
+    SmallVector<SDep, 2> Succs(SU.Succs);
+    for (SDep Succ : Succs)
+      removeExportDependencies(DAG, *Succ.getSUnit());
   }
 
   // Apply clustering if there are multiple exports
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -602,6 +602,27 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_export_across_store_load:
+; GCN: buffer_store
+; GCN: buffer_load
+; GCN: exp pos0
+; GCN: exp param0
+; GCN: exp param1
+define amdgpu_kernel void @test_export_across_store_load(i32 %idx, float %v) #0 {
+  %data0 = alloca <4 x float>, align 8, addrspace(5)
+  %data1 = alloca <4 x float>, align 8, addrspace(5)
+  %cmp = icmp eq i32 %idx, 1
+  %data = select i1 %cmp, <4 x float> addrspace(5)* %data0, <4 x float> addrspace(5)* %data1
+  %sptr = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data, i32 0, i32 0
+  store float %v, float addrspace(5)* %sptr, align 8
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false)
+  %ptr0 = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %data0, i32 0, i32 0
+  %load0 = load float, float addrspace(5)* %ptr0, align 8
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind inaccessiblememonly }
 attributes #2 = { nounwind readnone }