diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,9 +14,11 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
 
@@ -70,9 +72,68 @@
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
-  const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
-  return !MSSA->isLiveOnEntryDef(MA);
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  const auto isReallyAClobber = [](MemoryDef *Def) -> bool {
+    Instruction *DefInst = Def->getMemoryInst();
+    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
+
+    if (isa<FenceInst>(DefInst))
+      return false;
+
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::amdgcn_s_barrier:
+      case Intrinsic::amdgcn_wave_barrier:
+        return false;
+      default:
+        break;
+      }
+    }
+
+    return true;
+  };
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access; it will be either
+  // live on entry (nothing to do, the load is not clobbered), a MemoryDef,
+  // or a MemoryPhi if several MemoryDefs can define this memory state. In
+  // the latter case add all Defs to the WorkList and continue walking up,
+  // checking all definitions of this memory location until the root. If all
+  // defs are exhausted and we reach the entry state, there is no clobber.
+  // Along the way ignore barriers and fences, which MemorySSA treats as
+  // clobbers even though they do not actually write any memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      if (isReallyAClobber(Def)) {
+        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
+  return false;
 }
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -0,0 +1,438 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that a barrier or fence between loads is not considered a clobber
+; for the purpose of converting vector loads into scalar loads.
+
+@LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef
+
+; GCN-LABEL: {{^}}simple_barrier:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: ; wave barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @simple_barrier(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  tail call void @llvm.amdgcn.wave.barrier()
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_no_clobber:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_no_clobber(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK:       if.else:
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
+; CHECK:       if.end:
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  fence syncscope("workgroup") release
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber1:
+; GCN: s_load_dword s
+; GCN: s_barrier
+; GCN: global_store_dword
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_clobber1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK:       if.then:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
+; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK:       if.else:
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
+; CHECK:       if.end:
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.else:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber2:
+; GCN-DAG: s_load_dword s
+; GCN-DAG: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_clobber2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK:       if.then:
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK:       if.else:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
+; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
+; CHECK:       if.end:
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobbering_loop1:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) {
+; CHECK-LABEL: @no_clobbering_loop1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  tail call void @llvm.amdgcn.wave.barrier()
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobbering_loop2:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
+; CHECK-LABEL: @no_clobbering_loop2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i32 [[C]], !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3]] = add i32 [[I2]], [[ACC]]
+; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[C]], 1
+; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK:       end:
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
+  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %acc
+  tail call void @llvm.amdgcn.wave.barrier()
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, %n
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  store i32 %i3, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobbering_loop:
+; GCN: s_load_dword s
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) {
+; CHECK-LABEL: @clobbering_loop(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  tail call void @llvm.amdgcn.wave.barrier()
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobber_by_atomic_load:
+; GCN: s_load_dword s
+; GCN: global_load_dword {{.*}} glc
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @clobber_by_atomic_load(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 4
+; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_store:
+; GCN: ds_write_b32
+; GCN: s_barrier
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 0, i32 addrspace(3)* @LDS, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}may_alias_store:
+; GCN: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @may_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @may_alias_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 0, i32 addrspace(1)* %out, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_volatile_store:
+; GCN: ds_write_b32
+; GCN: s_barrier
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_volatile_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+declare void @llvm.amdgcn.wave.barrier()