Index: llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -372,6 +374,10 @@
   AMDGPU::IsaVersion IV;
 
   DenseSet<MachineInstr *> TrackedWaitcntSet;
+  // Maps the IR address read by a scalar load to the block containing that
+  // load, so that a later store to the same address can wait on LGKM_CNT.
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  MachinePostDominatorTree *PDT;
 
   struct BlockInfo {
     MachineBasicBlock *MBB;
@@ -406,6 +412,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -792,6 +799,7 @@
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
 
@@ -1012,6 +1020,18 @@
       if (MI.mayStore()) {
         // FIXME: Should not be relying on memoperands.
         for (const MachineMemOperand *Memop : MI.memoperands()) {
+          // A store to an address that an earlier scalar load read from must
+          // wait on LGKM_CNT for that load. Memory operands that carry no IR
+          // value cannot be matched to a load and are skipped.
+          const Value *Ptr = Memop->getValue();
+          auto SLoadIt = Ptr ? SLoadAddresses.find(Ptr) : SLoadAddresses.end();
+          if (SLoadIt != SLoadAddresses.end()) {
+            addWait(Wait, LGKM_CNT, 0);
+            // Forget the address once this store's block post-dominates the
+            // block that contains the load.
+            if (PDT->dominates(MI.getParent(), SLoadIt->second))
+              SLoadAddresses.erase(SLoadIt);
+          }
           unsigned AS = Memop->getAddrSpace();
           if (AS != AMDGPUAS::LOCAL_ADDRESS)
             continue;
@@ -1399,6 +1419,16 @@
       }
     }
 
+    // Record the IR address read by each scalar memory instruction so that a
+    // later store to the same address waits for it. Memory operands that
+    // carry no IR value are not recorded.
+    if (TII->isSMRD(Inst)) {
+      for (const MachineMemOperand *Memop : Inst.memoperands()) {
+        if (const Value *Ptr = Memop->getValue())
+          SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+      }
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1478,9 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
+  // Start each function with no scalar-load addresses from a previous one.
+  SLoadAddresses.clear();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
Index: llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: BB0_1
+; CHECK: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; CHECK: s_waitcnt lgkmcnt(0)
+; CHECK: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @smrd_vmem_war(i32 addrspace(1)* nocapture %0, i64 addrspace(1)* nocapture %1) {
+  %3 = call i32 @llvm.amdgcn.workitem.id.x()
+  %4 = icmp eq i32 %3, 0
+  br i1 %4, label %5, label %10
+
+5:                                                ; preds = %2
+  %6 = load i32, i32 addrspace(1)* %0, align 4
+  store i32 0, i32 addrspace(1)* %0, align 4
+  %7 = zext i32 %6 to i64
+  %8 = load i64, i64 addrspace(1)* %1, align 8
+  %9 = add i64 %8, %7
+  store i64 %9, i64 addrspace(1)* %1, align 8
+  br label %10
+
+10:                                               ; preds = %5, %2
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()