diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -1525,7 +1526,9 @@ void apply(ScheduleDAGInstrs *DAGInstrs) override; protected: - void clusterNeighboringMemOps(ArrayRef MemOps, ScheduleDAGInstrs *DAG); + void clusterNeighboringMemOps(ArrayRef MemOps, + ScheduleDAGInstrs *DAG, + SmallBitVector &ClusteredUnits); }; class StoreClusterMutation : public BaseMemOpClusterMutation { @@ -1562,9 +1565,13 @@ } // end namespace llvm void BaseMemOpClusterMutation::clusterNeighboringMemOps( - ArrayRef MemOps, ScheduleDAGInstrs *DAG) { + ArrayRef MemOps, ScheduleDAGInstrs *DAG, + SmallBitVector &ClusteredUnits) { SmallVector MemOpRecords; for (SUnit *SU : MemOps) { + // Skip units that already belong to a cluster. + if (ClusteredUnits.test(SU->NodeNum)) + continue; SmallVector BaseOps; int64_t Offset; if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset, TRI)) @@ -1602,6 +1609,8 @@ DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); } ++ClusterLength; + ClusteredUnits.set(SUa->NodeNum); + ClusteredUnits.set(SUb->NodeNum); } else ClusterLength = 1; } else @@ -1630,9 +1639,16 @@ Chain.push_back(&SU); } - // Iterate over the store chains. - for (auto &SCD : StoreChains) - clusterNeighboringMemOps(SCD.second, DAG); + // Iterate over the store chains. Units without any ctrl preds are appended + // to every other chain before that chain is clustered. 
+ SmallBitVector ClusteredUnits(DAG->SUnits.size()); + const auto &Free = StoreChains.FindAndConstruct(DAG->SUnits.size()).second; + for (auto &SCD : StoreChains) { + if (SCD.first != DAG->SUnits.size()) + for (SUnit *S : Free) + SCD.second.push_back(S); + clusterNeighboringMemOps(SCD.second, DAG, ClusteredUnits); + } } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll --- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll +++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll @@ -3,15 +3,17 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %dct_table, i16* nocapture readonly %coef_block, i8** nocapture readonly %output_buf, i32 %output_col) local_unnamed_addr #0 { ; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic: -; CHECK: // %bb.0: // %entry +; CHECK: .Ljsimd_idct_ifast_neon_intrinsic$local: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x1, #32] -; CHECK-NEXT: ldr q1, [x1, #96] -; CHECK-NEXT: ldr q2, [x0, #32] +; CHECK-NEXT: ldr q1, [x0, #32] +; CHECK-NEXT: ldr q2, [x1, #96] ; CHECK-NEXT: ldr q3, [x0, #96] ; CHECK-NEXT: ldr x8, [x2, #48] +; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h ; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h -; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h +; CHECK-NEXT: mul v1.8h, v3.8h, v2.8h ; CHECK-NEXT: add v2.8h, v0.8h, v1.8h ; CHECK-NEXT: str q2, [x8, x9] ; CHECK-NEXT: ldr x8, [x2, #56] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1595,19 +1595,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 
+; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0 -; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: ds_inc_rtn_u32 v6, v5, v4 +; GFX9-NEXT: ds_inc_rtn_u32 v4, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v5, off +; GFX9-NEXT: global_store_dword v[2:3], v4, off ; GFX9-NEXT: s_endpgm %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll --- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -7,7 +7,6 @@ ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 @@ -23,7 +22,6 @@ ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 @@ -39,7 +37,6 @@ ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 @@ -55,7 +52,6 @@ ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16: ; GCN: buffer_store_dwordx4 
; GCN: buffer_store_dwordx4 -; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -491,12 +491,12 @@ ; GCN: enable_vgpr_workitem_id = 0 ; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: s_add_u32 s32, s33, 0x400{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 -; GCN: s_add_u32 s32, s33, 0x400{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], @@ -520,8 +520,8 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -51,8 +51,8 @@ ; Same frame index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} 
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32 addrspace(5)*, addrspace(5) @@ -66,9 +66,9 @@ ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} -; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} - ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} + +; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -364,13 +364,13 @@ ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x7b -; GCN-NEXT: ds_read_u16 v2, v0 -; GCN-NEXT: ds_write_b16 v1, v3 +; GCN-NEXT: v_mov_b32_e32 v2, 0x7b +; GCN-NEXT: ds_read_u16 v3, v0 +; GCN-NEXT: ds_write_b16 v1, v2 ; GCN-NEXT: ds_read_u16 v0, v0 offset:2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GCN-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -98,8 +98,8 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN-NOT: {{s|flat|buffer|global}}_load ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16 -; 
GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[V_ELT2]] ; GCN: buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -20,22 +20,19 @@ ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-UNALIGNED-NEXT: flat_load_ushort v0, v[0:1] -; GFX7-UNALIGNED-NEXT: flat_load_ushort v1, v[2:3] +; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1] ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_load_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 @@ -52,45 +49,37 @@ ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; 
GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 2 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v4 +; GFX7-ALIGNED-NEXT: flat_store_short v[2:3], v5 ; GFX7-ALIGNED-NEXT: s_endpgm ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 1 +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-UNALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 -; GFX7-UNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 2 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 +; GFX7-UNALIGNED-NEXT: flat_store_dword v[0:1], v2 ; GFX7-UNALIGNED-NEXT: s_endpgm ; ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v[0:1], 
v2, off -; GFX9-NEXT: global_store_short v[0:1], v3, off offset:2 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 2 @@ -155,8 +144,9 @@ ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v8, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v9, 0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v10, 2 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 @@ -165,18 +155,17 @@ ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 +; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v7, s3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5 -; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v8 +; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v9 +; GFX7-ALIGNED-NEXT: flat_store_byte v[4:5], v9 +; GFX7-ALIGNED-NEXT: flat_store_byte v[6:7], v10 ; GFX7-ALIGNED-NEXT: s_endpgm ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll 
--- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -45,8 +45,8 @@ store volatile i64 %add7, i64 addrspace(1)* %ptr9 ; Test various offset boundaries. -; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}} ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 %load11 = load i64, i64 addrspace(1)* %gep11 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -35,12 +35,12 @@ ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit 
$exec ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -341,16 +341,15 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_i32 s3, s0, 0x80008 ; SI-NEXT: s_ashr_i32 s1, s0, 24 ; SI-NEXT: s_bfe_i32 s2, s0, 0x80010 -; SI-NEXT: s_bfe_i32 s3, s0, 0x80008 ; SI-NEXT: s_sext_i32_i8 s0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) @@ -469,13 +468,12 @@ ; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48 ; SI-NEXT: s_ashr_i32 s5, s6, 16 ; SI-NEXT: s_sext_i32_i16 s6, s6 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_sext_i32_i16 s7, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) @@ -493,12 +491,12 @@ ; VI-NEXT: s_ashr_i32 s5, s6, 16 ; VI-NEXT: s_sext_i32_i16 s6, s6 ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_ashr_i32 s4, s7, 16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: s_sext_i32_i16 s7, s7 +; 
VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s4