diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -329,8 +329,12 @@
       const MachineOperand *Offset1Op =
           getNamedOperand(LdSt, AMDGPU::OpName::offset1);
 
-      unsigned Offset0 = Offset0Op->getImm();
-      unsigned Offset1 = Offset1Op->getImm();
+      // offset0/offset1 are unsigned 8-bit fields, but the immediate may be
+      // stored sign-extended (e.g. -127 for 129). Mask to the low 8 bits and
+      // compare in a wider type so that offset0 == 255 is never treated as
+      // adjacent to offset1 == 0.
+      unsigned Offset0 = Offset0Op->getImm() & 0xff;
+      unsigned Offset1 = Offset1Op->getImm() & 0xff;
       if (Offset0 + 1 != Offset1)
         return false;
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,10 +1,27 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
+%struct.lds = type { [64 x ptr], [16 x i8] }
+@stored_lds_struct = addrspace(3) global %struct.lds undef, align 16
 @stored_lds_ptr = addrspace(3) global ptr addrspace(3) undef, align 4
 @stored_constant_ptr = addrspace(3) global ptr addrspace(4) undef, align 8
 @stored_global_ptr = addrspace(3) global ptr addrspace(1) undef, align 8
 
+; The two-dword LDS read below uses offsets 129/130, which do not fit in a
+; signed 8-bit value, and it overlaps the 16-byte LDS write that precedes it.
+; GCN-LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+; GCN: flat_load_dwordx4
+; GCN: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:512
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:129 offset1:130
+define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
+  %ptr1 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1
+  %ptr2 = getelementptr %struct.lds, ptr addrspace(3) @stored_lds_struct, i32 0, i32 1, i32 4
+  call void @llvm.memcpy.p3.p0(ptr addrspace(3) align 16 %ptr1, ptr align 8 %fptr, i64 16, i1 false)
+  %vector_load = load <2 x i32>, ptr addrspace(3) %ptr2, align 4
+  store <2 x i32> %vector_load, ptr addrspace(3) %out, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
@@ -312,6 +329,7 @@
   ret void
 }
 
+declare void @llvm.memcpy.p3.p0(ptr addrspace(3), ptr, i64, i1)
 declare void @llvm.amdgcn.s.barrier() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3
diff --git a/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/triv-disjoint-mem-access-neg-offset.mir
@@ -0,0 +1,159 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -x=mir -verify-machineinstrs -enable-misched --debug-only=machine-scheduler -start-after=amdgpu-isel -o /dev/null < %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Make sure handling of unsigned immediate values interpreted as negative values
+# still works for SIInstrInfo::areMemAccessesTriviallyDisjoint.
+
+# CHECK-LABEL: {{^}}no_reorder_flat_load_local_store_local_load:
+# CHECK: SU(5): %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# CHECK: SU(6): DS_WRITE_B128_gfx9 %8:vgpr_32, %6:vreg_128, 512, 0, implicit $exec
+# CHECK: SU(7): %9:vreg_64 = DS_READ2_B32_gfx9 %8:vgpr_32, -127, -126, 0, implicit $exec
+# CHECK: Predecessors:
+# CHECK-NEXT: SU(6): Ord
+# CHECK-NEXT: SU(5): Data
+--- |
+  %llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t = type { ptr addrspace(3) }
+
+  @llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds = internal addrspace(3) global %llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t undef, align 16, !absolute_symbol !0
+
+  define amdgpu_kernel void @no_reorder_flat_load_local_store_local_load(ptr addrspace(3) %out, ptr %fptr) #0 {
+    %no_reorder_flat_load_local_store_local_load.kernarg.segment = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %no_reorder_flat_load_local_store_local_load.kernarg.segment, i64 36, !amdgpu.uniform !1
+    %out.load = load ptr addrspace(3), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !1
+    %fptr.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %no_reorder_flat_load_local_store_local_load.kernarg.segment, i64 44, !amdgpu.uniform !1
+    %fptr.load = load ptr, ptr addrspace(4) %fptr.kernarg.offset, align 4, !invariant.load !1
+    call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 16 getelementptr (%llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds, i32 128, i32 0), ptr align 8 %fptr.load, i64 16, i1 false) ; 16-byte copy into addrspace(3), starting at element index 128
+    %vector_load = load <2 x i32>, ptr addrspace(3) getelementptr (%llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds, i32 129, i32 0), align 4 ; reads two i32 starting at element index 129, inside the range just written
+    store <2 x i32> %vector_load, ptr addrspace(3) %out.load, align 4
+    ret void
+  }
+
+  declare void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+  declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2
+
+  attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-wave-limiter"="true" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+  attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx900" }
+  attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+  !0 = !{i32 0, i32 1}
+  !1 = !{}
+
+...
+---
+name: no_reorder_flat_load_local_store_local_load
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+  - { id: 1, class: sgpr_64, preferred-register: '' }
+  - { id: 2, class: sgpr_32, preferred-register: '' }
+  - { id: 3, class: sgpr_32, preferred-register: '' }
+  - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '' }
+  - { id: 5, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 6, class: vreg_128, preferred-register: '' }
+  - { id: 7, class: vreg_64, preferred-register: '' }
+  - { id: 8, class: vgpr_32, preferred-register: '' }
+  - { id: 9, class: vreg_64, preferred-register: '' }
+  - { id: 10, class: sreg_32, preferred-register: '' }
+  - { id: 11, class: sreg_32, preferred-register: '' }
+  - { id: 12, class: vgpr_32, preferred-register: '' }
+  - { id: 13, class: vgpr_32, preferred-register: '' }
+  - { id: 14, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 1
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  hasTailCall: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+  explicitKernArgSize: 16
+  maxKernArgAlign: 8
+  ldsSize: 4 # NOTE(review): smaller than the offset-512 DS write in the body; presumably harmless for a scheduler-only test - confirm
+  gdsSize: 0
+  dynLDSAlign: 1
+  isEntryFunction: true
+  noSignedZerosFPMath: false
+  memoryBound: true
+  waveLimiter: true
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  scratchRSrcReg: '$private_rsrc_reg'
+  frameOffsetReg: '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid: true
+  argumentInfo:
+    kernargSegmentPtr: { reg: '$sgpr0_sgpr1' }
+    workGroupIDX: { reg: '$sgpr2' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr3' }
+    workItemIDX: { reg: '$vgpr0' }
+  psInputAddr: 0
+  psInputEnable: 0
+  mode:
+    ieee: true
+    dx10-clamp: true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy: 8
+  vgprForAGPRCopy: ''
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr0_sgpr1
+
+    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+    %4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.out.kernarg.offset, addrspace 4)
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1(p4), 44, 0 :: (dereferenceable invariant load (s64) from %ir.fptr.kernarg.offset, align 4, addrspace 4)
+    %7:vreg_64 = COPY %5
+    %6:vreg_128 = FLAT_LOAD_DWORDX4 killed %7, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.fptr.load, align 8)
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    DS_WRITE_B128_gfx9 %8, killed %6, 512, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr (%llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds, i32 128, i32 0)`, addrspace 3) ; 128-bit store with offset operand 512
+    %9:vreg_64 = DS_READ2_B32_gfx9 %8, -127, -126, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) getelementptr (%llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.no_reorder_flat_load_local_store_local_load.lds, i32 129, i32 0)`, align 4, addrspace 3) ; offset0/offset1 deliberately spelled as negative immediates -127/-126, i.e. 129/130 as unsigned 8-bit values
+    %10:sreg_32 = COPY %9.sub1
+    %11:sreg_32 = COPY %9.sub0
+    %12:vgpr_32 = COPY %4
+    %13:vgpr_32 = COPY %11
+    %14:vgpr_32 = COPY %10
+    DS_WRITE2_B32_gfx9 killed %12, killed %13, killed %14, 0, 1, 0, implicit $exec :: (store (s64) into %ir.out.load, align 4, addrspace 3)
+    S_ENDPGM 0
+
+...