Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -141,6 +141,18 @@
   }
 }
 
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
 static bool
 canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                         ArrayRef<MachineInstr*> InstsToMove,
@@ -152,8 +164,8 @@
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
-    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
-      return false;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+      return false;
   }
   return true;
 }
@@ -212,7 +224,7 @@
       return E;
 
     if (MBBI->mayLoadOrStore() &&
-        !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
       // We fail condition #1, but we may still be able to satisfy condition
       // #2. Add this instruction to the move list and then we will check
       // if condition #2 holds once we have selected the matching instruction.
@@ -268,8 +280,10 @@
     // We could potentially keep looking, but we'd need to make sure that
     // it was safe to move I and also all the instruction in InstsToMove
     // down past this instruction.
-    // FIXME: This is too conservative.
-    break;
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||     // check if we can move I across MBBI
+        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+       )
+      break;
   }
   return E;
 }
Index: test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2.ll
+++ test/CodeGen/AMDGPU/ds_read2.ll
@@ -493,6 +493,51 @@
   ret void
 }
 
+; SI-LABEL: ds_read_diff_base_interleaving
+; SI-NOT: ds_read_b32
+define void @ds_read_diff_base_interleaving(float addrspace(1)* nocapture,
+  [4 x [4 x float]] addrspace(3) *,
+  [4 x [4 x float]] addrspace(3) *,
+  [4 x [4 x float]] addrspace(3) *,
+  [4 x [4 x float]] addrspace(3) *
+) {
+
+  %st_addr = getelementptr float, float addrspace(1)* %0, i64 10
+  %id_x = tail call i32 @llvm.amdgcn.workitem.id.x() #4
+  %id_y = tail call i32 @llvm.amdgcn.workitem.id.y() #4
+
+  %6 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %1, i32 0, i32 %id_y, i32 0
+  %7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %2, i32 0, i32 0, i32 %id_x
+  %8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %3, i32 0, i32 %id_y, i32 0
+  %9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %4, i32 0, i32 0, i32 %id_x
+  %10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %1, i32 0, i32 %id_y, i32 1
+  %11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %2, i32 0, i32 1, i32 %id_x
+  %12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %3, i32 0, i32 %id_y, i32 1
+  %13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %4, i32 0, i32 1, i32 %id_x
+
+
+  %14 = load float, float addrspace(3)* %6
+  %15 = load float, float addrspace(3)* %7
+  %mul3 = fmul float %14, %15
+  %add1 = fadd float 2.0, %mul3
+  %16 = load float, float addrspace(3)* %8
+  %17 = load float, float addrspace(3)* %9
+  %mul4 = fmul float %16, %17
+  %sub2 = fsub float %add1, %mul4
+  %18 = load float, float addrspace(3)* %10
+  %19 = load float, float addrspace(3)* %11
+  %mul5 = fmul float %18, %19
+  %sub3 = fsub float %sub2, %mul5
+  %20 = load float, float addrspace(3)* %12
+  %21 = load float, float addrspace(3)* %13
+  %mul6 = fmul float %20, %21
+  %sub4 = fsub float %sub3, %mul6
+  store float %sub4, float addrspace(1)* %st_addr
+  ret void
+}
+
+
+
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
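For reference, the predicate added above reduces to a single rule: two memory accesses may swap if their addresses are provably disjoint, or if neither of them writes (only read-after-read is order-independent). A minimal standalone sketch of that rule, assuming nothing from the patch itself — MemAccess and canReorder are illustrative names, with a bool standing in for MachineInstr::mayStore() and a precomputed flag standing in for areMemAccessesTriviallyDisjoint():

// Sketch only: not the LLVM API, just the reordering rule in isolation.
#include <cassert>

struct MemAccess {
  bool IsStore; // true if the access writes memory
};

// RAR is always safe; RAW, WAR and WAW are safe only when the two
// addresses are known not to overlap.
static bool canReorder(const MemAccess &A, const MemAccess &B,
                       bool TriviallyDisjoint) {
  return TriviallyDisjoint || !(A.IsStore || B.IsStore);
}

int main() {
  MemAccess Load{false}, Store{true};
  assert(canReorder(Load, Load, false));    // RAR: reorder freely
  assert(!canReorder(Store, Load, false));  // RAW: blocked
  assert(!canReorder(Load, Store, false));  // WAR: blocked
  assert(!canReorder(Store, Store, false)); // WAW: blocked
  assert(canReorder(Store, Store, true));   // disjoint: always safe
  return 0;
}

The ds_read2.ll test exercises exactly this: its eight LDS loads are interleaved with fmul/fsub arithmetic, and the SI-NOT: ds_read_b32 check requires that no unmerged single-dword read survives, i.e. that the optimizer now reorders the independent loads past each other and pairs them all.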