diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -2371,7 +2371,8 @@
 /// IRAttribute::manifest is defined in the Attributor.cpp.
 struct IRAttributeManifest {
   static ChangeStatus manifestAttrs(Attributor &A, const IRPosition &IRP,
-                                    const ArrayRef<Attribute> &DeducedAttrs);
+                                    const ArrayRef<Attribute> &DeducedAttrs,
+                                    bool ForceReplace = false);
 };
 
 /// Helper to tie a abstract state implementation to an abstract attribute.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -96,6 +96,8 @@
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
 Pass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAttributorPass();
+void initializeAMDGPUAttributorPass(PassRegistry &);
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -0,0 +1,449 @@
+//===- AMDGPUAttributor.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+
+#define DEBUG_TYPE "amdgpu-attributor"
+
+using namespace llvm;
+
+static constexpr StringLiteral ImplicitAttrNames[] = {
+    // X ids unnecessarily propagated to kernels.
+    "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+    "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+    "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+    "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
+
+// We do not need to note the x workitem or workgroup id because they are
+// always initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
+                                     bool &IsQueuePtr) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-item-id-x";
+  case Intrinsic::amdgcn_workgroup_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-group-id-x";
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return "amdgpu-work-item-id-y";
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return "amdgpu-work-item-id-z";
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return "amdgpu-work-group-id-y";
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return "amdgpu-work-group-id-z";
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return "amdgpu-dispatch-ptr";
+  case Intrinsic::amdgcn_dispatch_id:
+    return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+    return "amdgpu-kernarg-segment-ptr";
+  case Intrinsic::amdgcn_implicitarg_ptr:
+    return "amdgpu-implicitarg-ptr";
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::amdgcn_is_shared:
+  case Intrinsic::amdgcn_is_private:
+    // TODO: Does not require queue ptr on gfx9+
+  case Intrinsic::trap:
+  case Intrinsic::debugtrap:
+    IsQueuePtr = true;
+    return "amdgpu-queue-ptr";
+  default:
+    return "";
+  }
+}
+
+class AMDGPUInformationCache : public InformationCache {
+public:
+  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
+                         BumpPtrAllocator &Allocator,
+                         SetVector<Function *> *CGSCC, TargetMachine &TM)
+      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
+  TargetMachine &TM;
+};
+
+struct AAAMDInfo : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDInfo &createForPosition(const IRPosition &IRP, Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAAMDInfo"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDInfo.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  virtual const DenseSet<StringRef> &getAttributes() const = 0;
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+const char AAAMDInfo::ID = 0;
+
+struct AAAMDWorkGroupSize
+    : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
+                                               Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAAMDWorkGroupSize"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDWorkGroupSize.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  virtual Optional<bool> getUniformWorkGroupSize() const = 0;
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+const char AAAMDWorkGroupSize::ID = 0;
+
+struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
+  AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
+      : AAAMDWorkGroupSize(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    if (F->hasFnAttribute("uniform-work-group-size"))
+      UniformWorkGroupSize = F->getFnAttribute("uniform-work-group-size")
+                                 .getValueAsString()
+                                 .equals("true");
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+        *this, this->getIRPosition(), DepClassTy::REQUIRED);
+
+    // We don't have to track the changes for HasCall.
+    HasCall =
+        !AAEdges.getOptimisticEdges().empty() || AAEdges.hasUnknownCallee();
+
+    // NOTE: Is this the correct way to handle this?
+    // This code replicates the behaviour of AMDGPUAnnotateKernelFeatures.cpp:
+    // if there is no (direct) call to a function, we do not add the
+    // uniform-work-group-size attribute; see simple-indirect-call.ll.
+    Optional<bool> NewUniformWorkGroupSize = UniformWorkGroupSize;
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDInfo] Call " << Caller->getName() << "->"
+                        << F->getName() << "\n");
+      HasCallSite = true;
+
+      if (!F->hasExactDefinition()) {
+        LLVM_DEBUG(dbgs() << "[AAAMDInfo] Giving up: " << F->getName() << "\n");
+        NewUniformWorkGroupSize = false;
+        return true;
+      }
+
+      const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+      // Propagate it from Caller to Callee.
+      Optional<bool> CallerWorkGroupSize = CallerInfo.getUniformWorkGroupSize();
+      if (CallerWorkGroupSize.hasValue()) {
+        // Assume false if the callers have different values.
+        if (NewUniformWorkGroupSize.hasValue() &&
+            NewUniformWorkGroupSize != CallerWorkGroupSize) {
+          LLVM_DEBUG(dbgs() << "[AAAMDInfo] WorkGroupSize conflict\n");
+          NewUniformWorkGroupSize = false;
+        } else {
+          NewUniformWorkGroupSize = CallerWorkGroupSize;
+        }
+      }
+
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    A.checkForAllCallSites(CheckCallSite, *this, false, AllCallSitesKnown);
+
+    if (NewUniformWorkGroupSize != UniformWorkGroupSize)
+      Change = ChangeStatus::CHANGED;
+    UniformWorkGroupSize = NewUniformWorkGroupSize;
+
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+    // If the attribute is absent, assume it is false.
+    // Do not emit any attribute if there is no call or call site, to replicate
+    // the behaviour of AMDGPUAnnotateKernelFeatures.cpp.
+    if ((HasCallSite || HasCall) && !UniformWorkGroupSize.hasValue())
+      UniformWorkGroupSize = false;
+
+    if (UniformWorkGroupSize.hasValue())
+      AttrList.push_back(
+          Attribute::get(Ctx, "uniform-work-group-size",
"true" : "false")); + return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, + /* ForceReplace */ true); + } + + const std::string getAsStr() const override { + std::string state = "none"; + if (UniformWorkGroupSize.hasValue()) + state = std::to_string(UniformWorkGroupSize.getValue()); + + return "AMDWorkGroupSize[" + state + "]"; + } + + virtual Optional getUniformWorkGroupSize() const override { + return UniformWorkGroupSize; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + +private: + /// HasCall and HasCallSite are only used during manifest stage. + bool HasCall = false, HasCallSite = false; + Optional UniformWorkGroupSize = llvm::None; +}; + +AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP, + Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A); + llvm_unreachable("AAAMDWorkGroupSize is only valid for function position"); +} + +struct AAAMDInfoFunction : public AAAMDInfo { + AAAMDInfoFunction(const IRPosition &IRP, Attributor &A) : AAAMDInfo(IRP, A) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + CallingConv::ID CC = F->getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + // Ignore functions with graphics calling conventions, these are currently + // not allowed to have kernel arguments. + if (AMDGPU::isGraphics(F->getCallingConv())) { + indicatePessimisticFixpoint(); + return; + } + + for (StringRef Attr : ImplicitAttrNames) { + if (F->hasFnAttribute(Attr)) + Attributes.insert(Attr); + } + + // If this function hasAddressTaken() = true + // then add all attributes corresponding to the implicit args. + if (CallingConvSupportsAllImplicits && + F->hasAddressTaken(nullptr, true, true, true)) { + for (StringRef AttrName : ImplicitAttrNames) { + Attributes.insert(AttrName); + } + } + } + + ChangeStatus updateImpl(Attributor &A) override { + Function *F = getAssociatedFunction(); + ChangeStatus Change = ChangeStatus::UNCHANGED; + bool IsEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); + CallingConv::ID CC = F->getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + auto &InfoCache = static_cast(A.getInfoCache()); + TargetMachine &TM = InfoCache.TM; + + auto AddAttribute = [&](StringRef AttrName) { + if (Attributes.insert(AttrName).second) + Change = ChangeStatus::CHANGED; + }; + + // Check for Intrinsics and propagate attributes. + const AACallEdges &AAEdges = A.getAAFor( + *this, this->getIRPosition(), DepClassTy::REQUIRED); + + // We have to assume that we can reach a function with these attributes. 
+    if (CallingConvSupportsAllImplicits && AAEdges.hasUnknownCallee()) {
+      for (StringRef AttrName : ImplicitAttrNames) {
+        AddAttribute(AttrName);
+      }
+    }
+
+    bool NeedsQueuePtr = false;
+    for (Function *Callee : AAEdges.getOptimisticEdges()) {
+      Intrinsic::ID IID = Callee->getIntrinsicID();
+      if (IID != Intrinsic::not_intrinsic) {
+        if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+          AddAttribute("amdgpu-kernarg-segment-ptr");
+          continue;
+        }
+
+        bool NonKernelOnly = false;
+        StringRef AttrName =
+            intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);
+
+        if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
+          AddAttribute(AttrName);
+
+        continue;
+      }
+
+      const AAAMDInfo &AAAMD = A.getAAFor<AAAMDInfo>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
+      // Propagate implicit attributes from the called function.
+      for (StringRef AttrName : ImplicitAttrNames)
+        if (CalleeAttributes.count(AttrName))
+          AddAttribute(AttrName);
+    }
+
+    bool HasCall =
+        !AAEdges.getOptimisticEdges().empty() || AAEdges.hasUnknownCallee();
+    if (!IsNonEntryFunc && HasCall)
+      AddAttribute("amdgpu-calls");
+
+    // Check the function body.
+    auto CheckAlloca = [&](Instruction &I) {
+      AddAttribute("amdgpu-stack-objects");
+      return false;
+    };
+
+    A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca});
+
+    auto CheckAddrSpaceCasts = [&](Instruction &I) {
+      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
+      if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+          SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
+        NeedsQueuePtr = true;
+      return true;
+    };
+
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
+    bool HasApertureRegs = ST.hasApertureRegs();
+
+    // amdgpu-queue-ptr is not needed if aperture regs are present.
+    // FIXME: Walk through constant expressions.
+    if (!NeedsQueuePtr && (IsNonEntryFunc || !HasApertureRegs))
+      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
+                                {Instruction::AddrSpaceCast});
+
+    if (NeedsQueuePtr && (!IsNonEntryFunc && HasApertureRegs))
+      AddAttribute("amdgpu-queue-ptr");
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+    for (StringRef AttrName : Attributes)
+      AttrList.push_back(Attribute::get(Ctx, AttrName));
+
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
+  }
+
+  const DenseSet<StringRef> &getAttributes() const override {
+    return Attributes;
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+private:
+  /// AMDGPU attributes deduced for the associated function.
+  DenseSet<StringRef> Attributes;
+};
+
+AAAMDInfo &AAAMDInfo::createForPosition(const IRPosition &IRP, Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDInfoFunction(IRP, A);
+  llvm_unreachable("AAAMDInfo is only valid for function position");
+}
+
+class AMDGPUAttributor : public ModulePass {
+public:
+  AMDGPUAttributor() : ModulePass(ID) {}
+
+  /// doInitialization - Virtual method overridden by subclasses to do
+  /// any necessary initialization before any pass is run.
+  bool doInitialization(Module &) override {
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      report_fatal_error("TargetMachine is required");
+
+    TM = &TPC->getTM<TargetMachine>();
+    return false;
+  }
+
+  bool runOnModule(Module &M) override {
+    SetVector<Function *> Functions;
+    AnalysisGetter AG;
+    for (Function &F : M)
+      Functions.insert(&F);
+
+    CallGraphUpdater CGUpdater;
+    BumpPtrAllocator Allocator;
+    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
+    Attributor A(Functions, InfoCache, CGUpdater);
+
+    for (Function &F : M) {
+      A.getOrCreateAAFor<AAAMDInfo>(IRPosition::function(F));
+      A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
+    }
+
+    ChangeStatus Change = A.run();
+    return Change == ChangeStatus::CHANGED;
+  }
+
+  StringRef getPassName() const override { return "AMDGPU Attributor"; }
+  TargetMachine *TM;
+  static char ID;
+};
+
+char AMDGPUAttributor::ID = 0;
+
+Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
+INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -234,6 +234,7 @@
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPUAttributorPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -44,6 +44,7 @@
   AMDGPUAliasAnalysis.cpp
   AMDGPUAlwaysInlinePass.cpp
   AMDGPUAnnotateKernelFeatures.cpp
+  AMDGPUAttributor.cpp
   AMDGPUAnnotateUniformValues.cpp
   AMDGPUArgumentUsageInfo.cpp
   AMDGPUAsmPrinter.cpp
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -197,12 +197,14 @@
 /// attribute list \p Attrs. This is only the case if it was not already present
 /// in \p Attrs at the position describe by \p PK and \p AttrIdx.
 static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
-                             AttributeList &Attrs, int AttrIdx) {
+                             AttributeList &Attrs, int AttrIdx,
+                             bool ForceReplace = false) {
 
   if (Attr.isEnumAttribute()) {
     Attribute::AttrKind Kind = Attr.getKindAsEnum();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
     return true;
@@ -210,7 +212,8 @@
   if (Attr.isStringAttribute()) {
     StringRef Kind = Attr.getKindAsString();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
     return true;
@@ -218,7 +221,8 @@
   if (Attr.isIntAttribute()) {
     Attribute::AttrKind Kind = Attr.getKindAsEnum();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
@@ -299,7 +303,8 @@
 ChangeStatus
 IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
-                                   const ArrayRef<Attribute> &DeducedAttrs) {
+                                   const ArrayRef<Attribute> &DeducedAttrs,
+                                   bool ForceReplace) {
   Function *ScopeFn = IRP.getAnchorScope();
   IRPosition::Kind PK = IRP.getPositionKind();
@@ -327,7 +332,7 @@
   ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
   LLVMContext &Ctx = IRP.getAnchorValue().getContext();
   for (const Attribute &Attr : DeducedAttrs) {
-    if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
+    if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx(), ForceReplace))
       continue;
 
     HasChanged = ChangeStatus::CHANGED;
@@ -2032,6 +2037,8 @@
       // The alignment of a pointer is interesting for loads.
     case Instruction::Store:
       // The alignment of a pointer is interesting for stores.
+    case Instruction::Alloca:
+    case Instruction::AddrSpaceCast:
       IsInterestingOpcode = true;
     }
     if (IsInterestingOpcode) {
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefix=HSA %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
 
 declare i32 @llvm.r600.read.tgid.x() #0
 declare i32 @llvm.r600.read.tgid.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 
@@ -47,5 +48,6 @@
   ret void
 }
 
+; FIXME: The AMDGPU Attributor does not deduce the uniform-work-group-size attribute.
 ; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
 ; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -1,9 +1,14 @@
 ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+; FIXME: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
 
 ; If the kernel does not have the uniform-work-group-attribute, set both callee and caller as false
 
-; CHECK: define void @foo() #[[FOO:[0-9]+]] {
+; sink function is added to prevent attributor from deleting the functions.
+declare void @sink()
+
+; CHECK: define void @foo() #[[FOO:[0-9]+]]
 define void @foo() #0 {
+  call void @sink()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -1,9 +1,15 @@
+
 ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
 
 ; Test to verify if the attribute gets propagated across nested function calls
 
-; CHECK: define void @func1() #[[FUNC:[0-9]+]] {
+; Added to prevent Attributor from deleting calls.
+declare void @sink()
+
+; CHECK: define void @func1() #[[FUNC:[0-9]+]]
 define void @func1() #0 {
+  call void @sink()
   ret void
 }
@@ -20,6 +26,5 @@
 }
 
 attributes #2 = { "uniform-work-group-size"="true" }
-
 ; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" }
 ; CHECK: attributes #[[KERNEL]] = { "amdgpu-calls" "uniform-work-group-size"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -1,9 +1,14 @@
 ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+; FIXME: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+
+; Function added to prevent attributor from deleting call sites.
+declare void @sink()
 
 ; Two kernels with different values of the uniform-work-group-attribute call the same function
 
 ; CHECK: define void @func() #[[FUNC:[0-9]+]] {
 define void @func() #0 {
+  call void @sink()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -1,8 +1,13 @@
 ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+
+; function added to prevent attributor from deleting calls.
+declare void @sink()
 
 ; Propagate the uniform-work-group-attribute from the kernel to callee if it doesn't have it
 ; CHECK: define void @func() #[[FUNC:[0-9]+]] {
 define void @func() #0 {
+  call void @sink()
   ret void
 }
 
@@ -15,6 +20,7 @@
 ; External declaration of a function
 ; CHECK: define weak_odr void @weak_func() #[[FUNC]] {
 define weak_odr void @weak_func() #0 {
+  call void @sink()
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -1,4 +1,6 @@
 ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+
 ; Test to ensure recursive functions exhibit proper behaviour
 ; Test to generate fibonacci numbers
 
@@ -32,7 +34,9 @@
   ret void
 }
 
+; nounwind and readnone are added to match attributor results.
+attributes #0 = { nounwind readnone }
 attributes #1 = { "uniform-work-group-size"="true" }
 
-; CHECK: attributes #[[FIB]] = { "uniform-work-group-size"="true" }
+; CHECK: attributes #[[FIB]] = { nounwind readnone "uniform-work-group-size"="true" }
 ; CHECK: attributes #[[KERNEL]] = { "amdgpu-calls" "uniform-work-group-size"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -1,29 +1,27 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
 
-; CHECK: define void @func1() #[[FUNC:[0-9]+]] {
-define void @func1() {
-  ret void
-}
-; CHECK: define void @func4() #[[FUNC]] {
-define void @func4() {
-  ret void
-}
+; CHECK: declare void @func1() #[[FUNC0:[0-9]+]]
+declare void @func1()
+
+; CHECK: declare void @func4() #[[FUNC0]]
+declare void @func4()
 
-; CHECK: define void @func2() #[[FUNC]] {
+; CHECK: define void @func2() #[[FUNC0]] {
 define void @func2() #0 {
   call void @func4()
   call void @func1()
   ret void
 }
 
-; CHECK: define void @func3() #[[FUNC]] {
+; CHECK: define void @func3() #[[FUNC0]] {
 define void @func3() {
   call void @func1()
   ret void
 }
 
-; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] {
+; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC1:[0-9]+]] {
 define amdgpu_kernel void @kernel3() #0 {
   call void @func2()
   call void @func3()
@@ -32,4 +30,5 @@
 
 attributes #0 = { "uniform-work-group-size"="false" }
 
-; CHECK: attributes #[[FUNC]] = { "amdgpu-calls" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[FUNC0]] = { "uniform-work-group-size"="false" }
+; CHECK: attributes #[[FUNC1]] = { "amdgpu-calls" "uniform-work-group-size"="false" }