AMDGPU: mask DS read2/write2 offset0/offset1 immediates to 8 bits

The consecutive-offset check in SIInstrInfo.cpp now keeps only the low 8 bits
of the offset0/offset1 immediates, so operands that are read back as negative
numbers (the MIR test below uses -127 and -126) are handled as 129 and 130,
which are the offset0/offset1 values the .ll test expects on ds_read2_b32.

Tests:
  * si-triv-disjoint-mem-access.ll: a 16-byte LDS store at offset 512 followed
    by a two-dword LDS load that must still be emitted after it.
  * triv-disjoint-mem-access-neg-offset.mir: checks in the machine-scheduler
    debug output that the DS_READ2_B32 keeps an Ord dependency on the
    DS_WRITE_B128.

NOTE(review): the MIR test's function-name anchor was written as "# LABEL:",
which FileCheck does not recognise under the default CHECK prefix; it is
"# CHECK-LABEL:" here.
NOTE(review): line breaks and leading indentation were reconstructed from the
hunk line counts -- confirm against the tree before applying (or apply with
whitespace-insensitive matching).

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -329,8 +329,10 @@
       const MachineOperand *Offset1Op =
           getNamedOperand(LdSt, AMDGPU::OpName::offset1);
 
-      unsigned Offset0 = Offset0Op->getImm();
-      unsigned Offset1 = Offset1Op->getImm();
+      // Keep only the low 8 bits of each immediate so that a value read back
+      // as negative (e.g. -127) is compared as its unsigned form (129).
+      unsigned Offset0 = Offset0Op->getImm() & 0xff;
+      unsigned Offset1 = Offset1Op->getImm() & 0xff;
       if (Offset0 + 1 != Offset1)
         return false;
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,10 +1,25 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
+%struct.lds = type { [64 x ptr], [16 x i8] }
+@stored_lds_struct = addrspace(3) global %struct.lds addrspace(3) *undef, align 16
 @stored_lds_ptr = addrspace(3) global ptr addrspace(3) undef, align 4
 @stored_constant_ptr = addrspace(3) global ptr addrspace(4) undef, align 8
 @stored_global_ptr = addrspace(3) global ptr addrspace(1) undef, align 8
 
+; GCN-LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+; GCN: flat_load_dwordx4
+; GCN: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:512
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:129 offset1:130
+define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
+  %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1
+  %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4
+  call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false)
+  %vector_load = load <2 x i32>, ptr addrspace(3) %ptr2, align 4
+  store <2 x i32> %vector_load, ptr addrspace(3) %out, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
@@ -312,6 +327,7 @@
   ret void
 }
 
+declare void @llvm.memcpy.p3.p0(ptr addrspace(3), ptr, i64, i1)
 declare void @llvm.amdgcn.s.barrier() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3
diff --git a/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
@@ -0,0 +1,37 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched --debug-only=machine-scheduler -run-pass=machine-scheduler -o /dev/null %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Make sure handling of unsigned immediate values interpreted as negative values
+# still works for SIInstrInfo::areMemAccessesTriviallyDisjoint.
+
+# CHECK-LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+# CHECK: SU(5): %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: SU(6): DS_WRITE_B128_gfx9 %5:vgpr_32, %4:vreg_128, 512, 0, implicit $exec
+# CHECK: SU(7): %6:vreg_64 = DS_READ2_B32_gfx9 %5:vgpr_32, -127, -126, 0, implicit $exec
+# CHECK: Predecessors:
+# CHECK-NEXT: SU(6): Ord
+# CHECK-NEXT: SU(5): Data
+---
+name: no_reorder_flat_load_local_store_local_load
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 36, 0
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 44, 0
+    %7:vreg_64 = COPY %5
+    %6:vreg_128 = FLAT_LOAD_DWORDX4 killed %7, 0, 0, implicit $exec, implicit $flat_scr
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    DS_WRITE_B128_gfx9 %8, killed %6, 512, 0, implicit $exec
+    %9:vreg_64 = DS_READ2_B32_gfx9 %8, -127, -126, 0, implicit $exec
+    %10:sreg_32 = COPY %9.sub1
+    %11:sreg_32 = COPY %9.sub0
+    %12:vgpr_32 = COPY %4
+    %13:vgpr_32 = COPY %11
+    %14:vgpr_32 = COPY %10
+    DS_WRITE2_B32_gfx9 killed %12, killed %13, killed %14, 0, 1, 0, implicit $exec
+    S_ENDPGM 0
+
+...