Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -141,6 +141,27 @@ } } +// Add MI and its defs to the lists if MI reads one of the defs that are +// already in the list. Returns true in that case. +static bool +addToListsIfDependent(MachineInstr &MI, + SmallVectorImpl<const MachineOperand *> &Defs, + SmallVectorImpl<MachineInstr *> &Insts) { + for (const MachineOperand *Def : Defs) { + bool ReadDef = MI.readsVirtualRegister(Def->getReg()); + // If ReadDef is true, then there is a use of Def between I + // and the instruction that I will potentially be merged with. We + // will need to move this instruction after the merged instructions. + if (ReadDef) { + Insts.push_back(&MI); + addDefsToList(MI, Defs); + return true; + } + } + + return false; +} + static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, ArrayRef<MachineInstr *> InstsToMove, @@ -224,17 +245,7 @@ // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - for (const MachineOperand *Def : DefsToMove) { - bool ReadDef = MBBI->readsVirtualRegister(Def->getReg()); - // If ReadDef is true, then there is a use of Def between I - // and the instruction that I will potentially be merged with. We - // will need to move this instruction after the merged instructions. - if (ReadDef) { - InstsToMove.push_back(&*MBBI); - addDefsToList(*MBBI, DefsToMove); - break; - } - } + addToListsIfDependent(*MBBI, DefsToMove, InstsToMove); continue; } @@ -242,6 +253,15 @@ if (MBBI->hasOrderedMemoryRef()) return E; + // Handle a case like + // DS_WRITE_B32 addr, v, idx0 + // w = DS_READ_B32 addr, idx0 + // DS_WRITE_B32 addr, f(w), idx1 + // where the DS_READ_B32 ends up in InstsToMove and therefore prevents + // merging of the two writes. 
+ if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove)) + continue; + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); Index: llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll +++ llvm/trunk/test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test1: +; CHECK: ds_write_b32 +; CHECK: ds_read_b32 +; CHECK: ds_write_b32 +define amdgpu_vs void @test1(i32 %v) #0 { + %p0 = getelementptr i32, i32 addrspace(3)* null, i32 0 + %p1 = getelementptr i32, i32 addrspace(3)* null, i32 1 + + store i32 %v, i32 addrspace(3)* %p0 + + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + + %w = load i32, i32 addrspace(3)* %p0 + store i32 %w, i32 addrspace(3)* %p1 + ret void +} + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind }