diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1624,16 +1624,32 @@
     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
-    // Copy successor edges from SUa to SUb. Interleaving computation
-    // dependent on SUa can prevent load combining due to register reuse.
-    // Predecessor edges do not need to be copied from SUb to SUa since
-    // nearby loads should have effectively the same inputs.
-    for (const SDep &Succ : SUa->Succs) {
-      if (Succ.getSUnit() == SUb)
-        continue;
-      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
-                        << ")\n");
-      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    if (IsLoad) {
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since
+      // nearby loads should have effectively the same inputs.
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SUb)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+    } else {
+      // Copy predecessor edges from SUb to SUa to prevent the SUnits that
+      // SUb depends on from being scheduled in between SUb and SUa.
+      // Successor edges do not need to be copied from SUa to SUb since
+      // nothing will depend on the stores.
+      // Note that we do not need to worry about memory dependencies here:
+      // we never try to cluster mem ops that have a memory dependency.
+      for (const SDep &Pred : SUb->Preds) {
+        if (Pred.getSUnit() == SUa)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+      }
     }
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -194,3 +194,22 @@
   store i64 %add6.3, i64* %arrayidx5.3, align 8
   ret void
 }
+
+; Verify that SU(2) and SU(4) are the preds of SU(3)
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_missing_preds_edges:%bb.0
+; CHECK:Cluster ld/st SU(3) - SU(5)
+; CHECK: Copy Pred SU(4)
+; CHECK: Copy Pred SU(2)
+; CHECK:SU(2): %0:gpr64common = COPY $x0
+; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0
+; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
+; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1
+define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
+entry:
+  store i32 %m, i32* %p, align 4
+  %add = add nsw i32 %n, 5
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %add, i32* %arrayidx1, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,13 +744,13 @@
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GCN: s_getpc_b64
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,11 +624,10 @@
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
-
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXME: Why this reload?
@@ -670,9 +669,8 @@
 ; FIXED-ABI-NOT: v31
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1368,11 +1368,11 @@
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
-; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
 ; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
 ; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
+; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -307,6 +307,7 @@
 ; GCN: flat_load_dwordx4
 ; GCN: flat_load_dwordx4
+; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
@@ -315,7 +316,6 @@
 ; GCN: flat_store_dwordx4
-; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -160,16 +160,16 @@
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_mov_b32 s34, s32
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-
-; GCN: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-
-; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000
+; GCN: v_mov_b32_e32 v33, 0
+
+; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
+
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]