Index: llvm/lib/CodeGen/MachineScheduler.cpp =================================================================== --- llvm/lib/CodeGen/MachineScheduler.cpp +++ llvm/lib/CodeGen/MachineScheduler.cpp @@ -1579,16 +1579,30 @@ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); - // Copy successor edges from SUa to SUb. Interleaving computation - // dependent on SUa can prevent load combining due to register reuse. - // Predecessor edges do not need to be copied from SUb to SUa since nearby - // loads should have effectively the same inputs. - for (const SDep &Succ : SUa->Succs) { - if (Succ.getSUnit() == SUb) - continue; - LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum - << ")\n"); - DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + if (IsLoad) { + // Copy successor edges from SUa to SUb. Interleaving computation + // dependent on SUa can prevent load combining due to register reuse. + // Predecessor edges do not need to be copied from SUb to SUa since + // nearby loads should have effectively the same inputs. + for (const SDep &Succ : SUa->Succs) { + if (Succ.getSUnit() == SUb) + continue; + LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + } + } else { + // Copy predecessor edges from SUb to SUa so that the SUnits SUb + // depends on are not scheduled in between SUb and SUa. Successor edges + // do not need to be copied from SUa to SUb since no one will depend + // on stores. 
+ for (const SDep &Pred : SUb->Preds) { + if (Pred.getSUnit() == SUa) + continue; + LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial)); + } } ++ClusterLength; } else Index: llvm/test/CodeGen/AArch64/macro-fusion.ll =================================================================== --- llvm/test/CodeGen/AArch64/macro-fusion.ll +++ llvm/test/CodeGen/AArch64/macro-fusion.ll @@ -18,3 +18,22 @@ %sub = sub nsw i32 %add1, %d ret i32 %sub } + +; Verify that the load/store cluster creates the necessary dependencies +; between two fused stores. +define void @test_cluster(i32* %p, i32 %m, i32 %n) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: test_cluster:%bb.0 entry +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: Copy Pred SU([[SU2:[0-9]+]]) +; CHECK: Copy Pred SU([[SU0:[0-9]+]]) +; CHECK: SU([[SU0]]): %{{[0-9]+}}:gpr32 = COPY $w2 +; CHECK: SU([[SU2]]): %{{[0-9]+}}:gpr64common = COPY $x0 +; CHECK: SU([[SU3]]): STRWui +; CHECK: SU([[SU4]]): STRWui + store i32 %m, i32* %p, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1 + store i32 %n, i32* %arrayidx1, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -133,10 +133,10 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: 
buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 @@ -331,12 +331,12 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: s_waitcnt vmcnt(0) +; GCN: s_waitcnt vmcnt(3) ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 ; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 Index: llvm/test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max.i16.ll +++ llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -134,25 +134,25 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: flat_load_dword v8, v[0:1] -; VI-NEXT: flat_load_ushort v9, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_ushort v4, v[4:5] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: flat_load_ushort v8, 
v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v1, v8, v2 -; VI-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_max_i16_e32 v6, v7, v5 +; VI-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v9, v0 -; VI-NEXT: flat_store_dword v[6:7], v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_i16_e32 v4, v4, v8 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -94,7 +94,7 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ 
llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -14,6 +14,7 @@ ; ; GCN-LABEL: {{^}}ps_main: +; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s6, -1 @@ -23,14 +24,13 @@ ; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000 ; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000 ; GCN-NOT: s_mov_b32 s0 -; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx