Index: llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -265,8 +265,11 @@
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
   void addInstToMergeableList(const CombineInfo &CI,
                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
-  bool collectMergeableInsts(MachineBasicBlock &MBB,
-                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
+
+  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
+      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+      std::list<std::list<CombineInfo>> &MergeableInsts) const;
 
 public:
   static char ID;
@@ -1944,31 +1947,38 @@
   MergeableInsts.emplace_back(1, CI);
 }
 
-bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
-                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+std::pair<MachineBasicBlock::iterator, bool>
+SILoadStoreOptimizer::collectMergeableInsts(
+    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+    std::list<std::list<CombineInfo>> &MergeableInsts) const {
   bool Modified = false;
-  // Contain the list
-  MemInfoMap Visited;
-  // Contains the list of instructions for which constant offsets are being
-  // promoted to the IMM.
-  SmallPtrSet<MachineInstr *, 4> AnchorList;
 
   // Sort potential mergeable instructions into lists. One list per base address.
   unsigned Order = 0;
-  for (MachineInstr &MI : MBB.instrs()) {
+  MachineBasicBlock::iterator BlockI = Begin;
+  for (; BlockI != End; ++BlockI) {
+    MachineInstr &MI = *BlockI;
+
     // We run this before checking if an address is mergeable, because it can produce
     // better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    // Don't combine if volatile. We also won't be able to merge across this, so
+    // break the search. We can look after this barrier for separate merges.
+    if (MI.hasOrderedMemoryRef()) {
+      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
+
+      // Search will resume after this instruction in a separate merge list.
+      ++BlockI;
+      break;
+    }
+
     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
     if (InstClass == UNKNOWN)
       continue;
 
-    // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef())
-      continue;
-
     CombineInfo CI;
     CI.setMI(MI, *TII, *STM);
     CI.Order = Order++;
@@ -2008,7 +2018,7 @@
       ++I;
   }
 
-  return Modified;
+  return std::make_pair(BlockI, Modified);
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -2161,15 +2171,33 @@
   bool Modified = false;
 
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM. This is tracked for an entire block at a time.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+  MemInfoMap Visited;
   for (MachineBasicBlock &MBB : MF) {
-    std::list<std::list<CombineInfo> > MergeableInsts;
-    // First pass: Collect list of all instructions we know how to merge.
-    Modified |= collectMergeableInsts(MBB, MergeableInsts);
-    do {
-      OptimizeAgain = false;
-      Modified |= optimizeBlock(MergeableInsts);
-    } while (OptimizeAgain);
+    MachineBasicBlock::iterator SectionEnd;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+         I = SectionEnd) {
+      bool CollectModified;
+      std::list<std::list<CombineInfo>> MergeableInsts;
+
+      // First pass: Collect list of all instructions we know how to merge in a
+      // subset of the block.
+      std::tie(SectionEnd, CollectModified) =
+          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
+
+      Modified |= CollectModified;
+
+      do {
+        OptimizeAgain = false;
+        Modified |= optimizeBlock(MergeableInsts);
+      } while (OptimizeAgain);
+    }
+
+    Visited.clear();
+    AnchorList.clear();
   }
 
   return Modified;
Index: llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+@lds = internal addrspace(3) global [576 x double] undef, align 16
+
+; Stores to the same address appear multiple places in the same
+; block. When sorted by offset, the merges would fail. We should form
+; two groupings of ds_write2_b64 on either side of the fence.
+define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
+; GCN-LABEL: same_address_fence_merge_write2:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GCN-NEXT:    s_mov_b32 s1, 0x40100000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_add_u32_e32 v3, 0x840, v2
+; GCN-NEXT:    v_add_u32_e32 v4, 0xc60, v2
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_barrier
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_endpgm
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+  %tmp1 = getelementptr inbounds [576 x double], [576 x double] addrspace(3)* @lds, i32 0, i32 %tmp
+  store double 4.000000e+00, double addrspace(3)* %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 66
+  store double 4.000000e+00, double addrspace(3)* %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 132
+  store double 4.000000e+00, double addrspace(3)* %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 198
+  store double 4.000000e+00, double addrspace(3)* %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 264
+  store double 4.000000e+00, double addrspace(3)* %tmp5, align 8
+  %tmp6 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 330
+  store double 4.000000e+00, double addrspace(3)* %tmp6, align 8
+  %tmp7 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 396
+  store double 4.000000e+00, double addrspace(3)* %tmp7, align 8
+  %tmp8 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 462
+  store double 4.000000e+00, double addrspace(3)* %tmp8, align 8
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp7, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp8, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { convergent nounwind }
+
+!0 = !{i32 0, i32 1024}