Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,12 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -53,7 +52,6 @@
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -75,81 +73,6 @@
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
-  MemorySSAWalker *Walker = MSSA->getWalker();
-  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
-  SmallSet<MemoryAccess *, 8> Visited;
-  MemoryLocation Loc(MemoryLocation::get(Load));
-
-  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
-    Instruction *DefInst = Def->getMemoryInst();
-    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
-
-    if (isa<FenceInst>(DefInst))
-      return false;
-
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::amdgcn_s_barrier:
-      case Intrinsic::amdgcn_wave_barrier:
-        return false;
-      default:
-        break;
-      }
-    }
-
-    // Ignore atomics not aliasing with the original load, any atomic is a
-    // universal MemoryDef from MSSA's point of view too, just like a fence.
-    const auto checkNoAlias = [this, Load](auto I) -> bool {
-      return I && AA->isNoAlias(I->getPointerOperand(),
-                                Load->getPointerOperand());
-    };
-
-    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
-        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
-      return false;
-
-    return true;
-  };
-
-  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
-
-  // Start with a nearest dominating clobbering access, it will be either
-  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
-  // MemoryPhi if several MemoryDefs can define this memory state. In that
-  // case add all Defs to WorkList and continue going up and checking all
-  // the definitions of this memory location until the root. When all the
-  // defs are exhausted and came to the entry state we have no clobber.
-  // Along the scan ignore barriers and fences which are considered clobbers
-  // by the MemorySSA, but not really writing anything into the memory.
-  while (!WorkList.empty()) {
-    MemoryAccess *MA = WorkList.pop_back_val();
-    if (!Visited.insert(MA).second)
-      continue;
-
-    if (MSSA->isLiveOnEntryDef(MA))
-      continue;
-
-    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
-      if (isReallyAClobber(Def)) {
-        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
-        return true;
-      }
-
-      WorkList.push_back(
-          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
-      continue;
-    }
-
-    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
-    for (auto &Use : Phi->incoming_values())
-      WorkList.push_back(cast<MemoryAccess>(&Use));
-  }
-
-  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
-  return false;
-}
-
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (DA->isUniform(&I))
     setUniformMetadata(&I);
@@ -169,8 +92,7 @@
   if (!isEntryFunc)
     return;
   bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-  bool NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
-  if (NotClobbered)
+  if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA))
     setNoClobberMetadata(&I);
 }
 
Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -16,7 +16,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
@@ -30,6 +32,8 @@
 
 class AMDGPUPromoteKernelArguments : public FunctionPass {
   MemorySSA *MSSA;
+  AliasAnalysis *AA;
+
   Instruction *ArgCastInsertPt;
 
   SmallVector<Value *> Ptrs;
@@ -43,11 +47,12 @@
 
   AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
 
-  bool run(Function &F, MemorySSA &MSSA);
+  bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
 
   bool runOnFunction(Function &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<MemorySSAWrapperPass>();
     AU.setPreservesAll();
   }
@@ -75,9 +80,8 @@
            PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
           LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
         break;
-      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
       // TODO: This load poprobably can be promoted to constant address space.
-      if (MSSA->isLiveOnEntryDef(MA))
+      if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA))
         Ptrs.push_back(LD);
       break;
     }
@@ -131,7 +135,8 @@
   return InsPt;
 }
 
-bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
+                                       AliasAnalysis &AA) {
   if (skipFunction(F))
     return false;
 
@@ -141,6 +146,7 @@
   ArgCastInsertPt = &*getInsertPt(*F.begin());
 
   this->MSSA = &MSSA;
+  this->AA = &AA;
 
   for (Argument &Arg : F.args()) {
     if (Arg.use_empty())
@@ -166,11 +172,13 @@
 
 bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
   MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  return run(F, MSSA);
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  return run(F, MSSA, AA);
 }
 
 INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                       "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                     "AMDGPU Promote Kernel Arguments", false, false)
@@ -185,7 +193,8 @@
 AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
     PA.preserve<MemorySSAAnalysis>();
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,35 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+namespace llvm {
+
+class AAResults;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Value;
+
+namespace AMDGPU {
+
+/// Given a \p Def clobbering a load from \p Ptr according to the MSSA, check
+/// if this is actually a memory update or an artificial clobber to facilitate
+/// ordering constraints.
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+#define DEBUG_TYPE "amdgpu-memory-utils"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
+  Instruction *DefInst = Def->getMemoryInst();
+
+  if (isa<FenceInst>(DefInst))
+    return false;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_wave_barrier:
+      return false;
+    default:
+      break;
+    }
+  }
+
+  // Ignore atomics not aliasing with the original load, any atomic is a
+  // universal MemoryDef from MSSA's point of view too, just like a fence.
+  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+  };
+
+  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+    return false;
+
+  return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with a nearest dominating clobbering access, it will be either
+  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
+  // MemoryPhi if several MemoryDefs can define this memory state. In that
+  // case add all Defs to WorkList and continue going up and checking all
+  // the definitions of this memory location until the root. When all the
+  // defs are exhausted and came to the entry state we have no clobber.
+  // Along the scan ignore barriers and fences which are considered clobbers
+  // by the MemorySSA, but not really writing anything into the memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
+
+      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
+  return false;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
Index: llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@
   AMDGPUAsmUtils.cpp
   AMDGPUBaseInfo.cpp
   AMDGPULDSUtils.cpp
+  AMDGPUMemoryUtils.cpp
   AMDGPUPALMetadata.cpp
   AMDKernelCodeTUtils.cpp
 
Index: llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -314,4 +314,31 @@
   ret void
 }
 
+; GCN-LABEL: ptr_nest_3_barrier:
+; GCN-COUNT-2: global_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
+; CHECK-LABEL: @ptr_nest_3_barrier(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
+; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
+  tail call void @llvm.amdgcn.s.barrier()
+  %p2 = load float**, float** addrspace(1)* %p1, align 8
+  %p3 = load float*, float** %p2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.s.barrier()
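
For reference, a minimal sketch of how another pass could query the new utility, in the same way the two callers above do. The wrapper function below is hypothetical and not part of the patch; it only relies on the declarations added in AMDGPUMemoryUtils.h and on MemorySSA/AliasAnalysis results the caller already holds.

// Hypothetical usage sketch (not part of the patch): ask whether a load can
// safely be treated as reading the memory state live on function entry.
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true if MemorySSA, refined by the AMDGPU-specific clobber check,
// proves the load is not clobbered anywhere in its function.
static bool isNoClobberLoad(const LoadInst &LI, MemorySSA &MSSA,
                            AAResults &AA) {
  return !AMDGPU::isClobberedInFunction(&LI, &MSSA, &AA);
}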