Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1579,6 +1579,31 @@
 
   llvm::sort(MemOpRecords);
   unsigned ClusterLength = 1;
+  SmallVector<SUnit *, 4> ClusteredChain;
+
+  // Copy successor edges from all SUs in a cluster to the last SU. The last
+  // SUnit becomes a sentinel for the whole cluster.
+  // Interleaving computation dependent on an SU inside the cluster can prevent
+  // load combining due to register reuse. Predecessor edges do not need to be
+  // copied from the last SU to all other nodes since nearby loads should have
+  // effectively the same inputs.
+  auto transferSuccessors = [DAG, &ClusteredChain]() -> void {
+    if (ClusteredChain.empty())
+      return;
+
+    SUnit *SU = ClusteredChain.pop_back_val();
+    while (!ClusteredChain.empty()) {
+      SUnit *SUa = ClusteredChain.pop_back_val();
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SU)
+          continue;
+        LLVM_DEBUG(dbgs()
+                   << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SU, SDep::Artificial));
+      }
+    }
+  };
+
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
@@ -1587,26 +1612,24 @@
                                  ClusterLength + 1)) {
       if (SUa->NodeNum > SUb->NodeNum)
         std::swap(SUa, SUb);
+      if (ClusterLength == 1)
+        ClusteredChain.push_back(SUa);
+      ClusteredChain.push_back(SUb);
       if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
         LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                           << SUb->NodeNum << ")\n");
-        // Copy successor edges from SUa to SUb. Interleaving computation
-        // dependent on SUa can prevent load combining due to register reuse.
-        // Predecessor edges do not need to be copied from SUb to SUa since
-        // nearby loads should have effectively the same inputs.
-        for (const SDep &Succ : SUa->Succs) {
-          if (Succ.getSUnit() == SUb)
-            continue;
-          LLVM_DEBUG(dbgs()
-                     << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-        }
         ++ClusterLength;
-      } else
+      } else {
+        transferSuccessors();
         ClusterLength = 1;
-    } else
+      }
+    } else {
+      transferSuccessors();
       ClusterLength = 1;
+    }
   }
+
+  transferSuccessors();
 }
 
 /// Callback from DAG postProcessing to create cluster edges for loads.
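To make the intent of the new transferSuccessors lambda easier to follow, here is a minimal standalone C++ sketch of the same "sentinel" idea. The Node struct, the free addEdge function, and the transferSuccessors signature below are illustrative stand-ins, not LLVM's SUnit/SDep/ScheduleDAGInstrs API: successor edges of every clustered node are copied onto the last node of the cluster, so a computation that consumes one of the clustered loads cannot be scheduled in between them.

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy DAG node; Succs lists nodes that must be scheduled after this one.
struct Node {
  unsigned Num;
  std::vector<Node *> Succs;
};

// Adds Pred -> Succ unless the edge already exists; returns true on insert,
// loosely mirroring the bool result of the DAG's addEdge in the patch.
static bool addEdge(Node *Pred, Node *Succ) {
  if (std::find(Pred->Succs.begin(), Pred->Succs.end(), Succ) !=
      Pred->Succs.end())
    return false;
  Pred->Succs.push_back(Succ);
  return true;
}

// Copy successor edges from every node in the cluster to the last node,
// making it a sentinel for the whole cluster, and drain the chain so the
// next cluster starts fresh.
static void transferSuccessors(std::vector<Node *> &Chain) {
  if (Chain.empty())
    return;
  Node *Last = Chain.back();
  Chain.pop_back();
  while (!Chain.empty()) {
    Node *N = Chain.back();
    Chain.pop_back();
    for (Node *Succ : N->Succs) {
      if (Succ == Last)
        continue;
      addEdge(Last, Succ); // Succ now also waits for the sentinel
    }
  }
}

int main() {
  Node LoadA{0, {}}, LoadB{1, {}}, LoadC{2, {}}, Add{3, {}};
  addEdge(&LoadB, &Add); // an add consumes the middle load
  std::vector<Node *> Cluster{&LoadA, &LoadB, &LoadC};
  transferSuccessors(Cluster);
  // LoadC is the sentinel: the add now depends on it too, so a scheduler
  // cannot slip the add in between the clustered loads.
  std::printf("sentinel successors: %zu\n", LoadC.Succs.size()); // prints 1
}

As the patch comment notes, predecessor edges are deliberately not copied back to the earlier nodes, since nearby loads effectively share the same inputs; only the successor side needs the sentinel.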
Index: llvm/test/CodeGen/AMDGPU/cluster_stores.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck --enable-var-scope --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}cluster_load_cluster_store:
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+  %ld0 = load i32, i32* %la0
+  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+  %ld1 = load i32, i32* %la1
+  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+  %ld2 = load i32, i32* %la2
+  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+  %ld3 = load i32, i32* %la3
+  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+
+  store i32 %ld0, i32* %sa0
+  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+  store i32 %ld1, i32* %sa1
+  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+  store i32 %ld2, i32* %sa2
+  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+  store i32 %ld3, i32* %sa3
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}cluster_load_valu_cluster_store:
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+  %ld0 = load i32, i32* %la0
+  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+  %ld1 = load i32, i32* %la1
+  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+  %ld2 = load i32, i32* %la2
+  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+  %ld3 = load i32, i32* %la3
+  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+
+  store i32 %ld0, i32* %sa0
+  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+
+  %add = add i32 %ld1, 1
+  store i32 %add, i32* %sa1
+  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+  store i32 %ld2, i32* %sa2
+  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+  store i32 %ld3, i32* %sa3
+
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -86,9 +86,9 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
@@ -225,11 +225,11 @@
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ;
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024