Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,9 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -70,9 +72,61 @@
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
-  const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
-  return !MSSA->isLiveOnEntryDef(MA);
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
+  MemorySSAWalker *Walker = MSSA->getSkipSelfWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access: it is either live
+  // on entry (nothing to do, the load is not clobbered), a MemoryDef, or a
+  // MemoryPhi if several MemoryDefs can define this memory state. In the
+  // latter case add all Defs to the WorkList and continue going up, checking
+  // all the definitions of this memory location until the root. When all the
+  // defs are exhausted and the entry state is reached, there is no clobber.
+  // Along the scan, ignore barriers and fences, which MemorySSA considers
+  // clobbers even though they do not actually write anything into memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      Instruction *DefInst = Def->getMemoryInst();
+
+      LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
+
+      if (isa<FenceInst>(DefInst)) {
+        WorkList.push_back(Def->getDefiningAccess());
+        continue;
+      }
+
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::amdgcn_s_barrier:
+        case Intrinsic::amdgcn_wave_barrier:
+          WorkList.push_back(Def->getDefiningAccess());
+          continue;
+        default:
+          break;
+        }
+      }
+
+      LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
+      return true;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
+  return false;
 }
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
Index: llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -0,0 +1,187 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that a barrier or fence in between loads is not considered a clobber
+; for the purpose of converting vector loads into scalar loads.
+
+; GCN-LABEL: {{^}}simple_barrier:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: ; wave barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  tail call void @llvm.amdgcn.wave.barrier()
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_no_clobber:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  fence syncscope("workgroup") release
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber1:
+; GCN: s_load_dword s
+; GCN: s_barrier
+; GCN: global_store_dword
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.else:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber2:
+; GCN-DAG: s_load_dword s
+; GCN-DAG: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobbering_loop:
+; GCN: s_load_dword s
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i1 %cc) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  tail call void @llvm.amdgcn.wave.barrier()
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobbering_loop:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
+  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %acc
+  tail call void @llvm.amdgcn.wave.barrier()
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, %n
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  store i32 %i3, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobber_by_atomic_load:
+; GCN: s_load_dword s
+; GCN: global_load_dword {{.*}} glc
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+declare void @llvm.amdgcn.wave.barrier()
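
For readers less familiar with the MemorySSA interface that the new isClobberedInFunction() builds on, the sketch below (not part of the patch) shows the underlying query in isolation: a legacy pass obtains MemorySSA and asks its walker for the nearest dominating access that may clobber a load; a live-on-entry definition means nothing in the function clobbers that load. The pass name CheckLoadClobbers and its registration string are hypothetical and exist only for illustration.

// Minimal sketch, assuming a legacy-pass-manager build of LLVM; the pass
// class and registration name are hypothetical, not part of the patch.
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

namespace {
class CheckLoadClobbers : public FunctionPass {
public:
  static char ID;
  CheckLoadClobbers() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>(); // make MemorySSA available
    AU.setPreservesAll();                   // analysis only, IR is untouched
  }

  bool runOnFunction(Function &F) override {
    MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
    MemorySSAWalker *Walker = MSSA.getWalker();
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *Load = dyn_cast<LoadInst>(&I)) {
          // Nearest dominating access that may clobber this load. If it is
          // the live-on-entry def, no store, fence, or call in this function
          // can clobber the loaded location.
          const MemoryAccess *MA = Walker->getClobberingMemoryAccess(Load);
          if (MSSA.isLiveOnEntryDef(MA))
            errs() << "not clobbered: " << *Load << '\n';
        }
    return false;
  }
};
} // end anonymous namespace

char CheckLoadClobbers::ID = 0;
static RegisterPass<CheckLoadClobbers>
    X("check-load-clobbers", "Report loads MemorySSA proves unclobbered");

The patch refines this single query: rather than treating the first MemoryDef it meets as a clobber, it keeps walking the defining accesses upward, looking through fences and the amdgcn s_barrier/wave_barrier intrinsics and through all incoming values of a MemoryPhi, and only reports a clobber for a remaining MemoryDef.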