Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -344,7 +344,8 @@
     }
 
     if (MBBI->mayLoadOrStore() &&
-        !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
+        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
       // We fail condition #1, but we may still be able to satisfy condition
       // #2. Add this instruction to the move list and then we will check
       // if condition #2 holds once we have selected the matching instruction.
Index: llvm/trunk/test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-load-store.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -0,0 +1,70 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# Check that SILoadStoreOptimizer honors memory dependencies between moved
+# instructions.
+#
+# The following IR snippet would usually be optimized by the peephole optimizer.
+# However, an equivalent situation can occur with buffer instructions as well.
+
+# CHECK-LABEL: name: mem_dependency
+# CHECK: DS_READ2_B32 %0, 0, 1,
+# CHECK: DS_WRITE_B32 %0, killed %1, 64,
+# CHECK: DS_READ2_B32 %0, 16, 17,
+# CHECK: DS_WRITE_B32 killed %0, %5, 0
+
+--- |
+  define amdgpu_kernel void @mem_dependency(i32 addrspace(3)* %ptr.0) nounwind {
+    %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1
+    %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16
+    %1 = load i32, i32 addrspace(3)* %ptr.0
+    store i32 %1, i32 addrspace(3)* %ptr.64
+    %2 = load i32, i32 addrspace(3)* %ptr.64
+    %3 = load i32, i32 addrspace(3)* %ptr.4
+    %4 = add i32 %2, %3
+    store i32 %4, i32 addrspace(3)* %ptr.0
+    ret void
+  }
+...
+---
+name: mem_dependency
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0:
+    liveins: %vgpr0
+
+    %1:vgpr_32 = COPY %vgpr0
+    %m0 = S_MOV_B32 -1
+    %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit %m0, implicit %exec :: (load 4 from %ir.ptr.0)
+    DS_WRITE_B32 %1, killed %2, 64, 0, implicit %m0, implicit %exec :: (store 4 into %ir.ptr.64)
+
+    ; Make this load unmergeable, to tempt SILoadStoreOptimizer into merging the
+    ; other two loads.
+    %6:vreg_64 = DS_READ2_B32 %1, 16, 17, 0, implicit %m0, implicit %exec :: (load 8 from %ir.ptr.64, align 4)
+    %3:vgpr_32 = COPY %6.sub0
+    %4:vgpr_32 = DS_READ_B32 %1, 4, 0, implicit %m0, implicit %exec :: (load 4 from %ir.ptr.4)
+    %5:vgpr_32 = V_ADD_I32_e32 killed %3, killed %4, implicit-def %vcc, implicit %exec
+    DS_WRITE_B32 killed %1, %5, 0, 0, implicit killed %m0, implicit %exec :: (store 4 into %ir.ptr.0)
+    S_ENDPGM
+
+...
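Below is a minimal, self-contained sketch of the condition the patch tightens, for illustration only; it is not the LLVM code. The Inst struct, the address-range alias model, and the mustAddToMoveList helper are hypothetical stand-ins. In the real pass, memAccessesCanBeReordered and canMoveInstsAcrossMemOp consult AliasAnalysis and the target instruction info; here two accesses are simply assumed reorderable when their byte ranges do not overlap. The point being modeled: an intervening memory instruction must go on the move list not only when it conflicts with the first access of the candidate pair, but also when any instruction already queued for moving cannot itself be moved across it.

// Toy model of the tightened check; all names below are hypothetical.
#include <cstdint>
#include <cstdio>
#include <vector>

struct Inst {
  bool MayLoadOrStore;
  uint32_t Addr; // start of the accessed byte range
  uint32_t Size; // size of the access in bytes
};

// Stand-in for the alias query: reorderable iff the byte ranges do not overlap.
static bool memAccessesCanBeReordered(const Inst &A, const Inst &B) {
  if (!A.MayLoadOrStore || !B.MayLoadOrStore)
    return true;
  return A.Addr + A.Size <= B.Addr || B.Addr + B.Size <= A.Addr;
}

// Every instruction already queued for moving must be movable across MemOp.
static bool canMoveInstsAcrossMemOp(const Inst &MemOp,
                                    const std::vector<const Inst *> &InstsToMove) {
  for (const Inst *I : InstsToMove)
    if (!memAccessesCanBeReordered(*I, MemOp))
      return false;
  return true;
}

// Mirrors the patched condition: MBBI is added to the move list if it conflicts
// with the first access, or if the queued instructions cannot be moved past it.
static bool mustAddToMoveList(const Inst &FirstAccess, const Inst &MBBI,
                              const std::vector<const Inst *> &InstsToMove) {
  return MBBI.MayLoadOrStore &&
         (!memAccessesCanBeReordered(FirstAccess, MBBI) ||
          !canMoveInstsAcrossMemOp(MBBI, InstsToMove));
}

int main() {
  Inst Load0{true, 0, 4};    // like the DS_READ_B32 from %ir.ptr.0
  Inst Store64{true, 64, 4}; // like the DS_WRITE_B32 to %ir.ptr.64
  Inst Read64{true, 64, 8};  // like the DS_READ2_B32 from %ir.ptr.64
  std::vector<const Inst *> InstsToMove{&Store64};
  // With only the first clause, Read64 would not be flagged, since it does not
  // conflict with Load0; the second clause catches its dependency on the
  // already-queued Store64.
  std::printf("must add to move list: %d\n",
              mustAddToMoveList(Load0, Read64, InstsToMove));
  return 0;
}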