diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1519,6 +1519,13 @@
                      bool CheckBBLivenessOnly = false,
                      DepClassTy DepClass = DepClassTy::OPTIONAL);
 
+  /// Return true if \p BB is assumed dead.
+  ///
+  /// If \p LivenessAA is not provided it is queried.
+  bool isAssumedDead(const BasicBlock &BB, const AbstractAttribute *QueryingAA,
+                     const AAIsDead *FnLivenessAA,
+                     DepClassTy DepClass = DepClassTy::OPTIONAL);
+
   /// Check \p Pred on all (transitive) uses of \p V.
   ///
   /// This method will evaluate \p Pred on all (transitive) uses of the
@@ -2371,7 +2378,8 @@
 /// IRAttribute::manifest is defined in the Attributor.cpp.
 struct IRAttributeManifest {
   static ChangeStatus manifestAttrs(Attributor &A, const IRPosition &IRP,
-                                    const ArrayRef<Attribute> &DeducedAttrs);
+                                    const ArrayRef<Attribute> &DeducedAttrs,
+                                    bool ForceReplace = false);
 };
 
 /// Helper to tie a abstract state implementation to an abstract attribute.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -96,6 +96,8 @@
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
 Pass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAttributorPass();
+void initializeAMDGPUAttributorPass(PassRegistry &);
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -0,0 +1,538 @@
+//===- AMDGPUAttributor.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+
+#define DEBUG_TYPE "amdgpu-attributor"
+
+using namespace llvm;
+
+static constexpr StringLiteral ImplicitAttrNames[] = {
+    // X ids unnecessarily propagated to kernels.
+    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
+    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
+    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
+    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};
+
+// We do not need to note the x workitem or workgroup id because they are
+// always initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
+                                     bool &IsQueuePtr) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-item-id-x";
+  case Intrinsic::amdgcn_workgroup_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-group-id-x";
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return "amdgpu-work-item-id-y";
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return "amdgpu-work-item-id-z";
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return "amdgpu-work-group-id-y";
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return "amdgpu-work-group-id-z";
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return "amdgpu-dispatch-ptr";
+  case Intrinsic::amdgcn_dispatch_id:
+    return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+    return "amdgpu-kernarg-segment-ptr";
+  case Intrinsic::amdgcn_implicitarg_ptr:
+    return "amdgpu-implicitarg-ptr";
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::amdgcn_is_shared:
+  case Intrinsic::amdgcn_is_private:
+    // TODO: Does not require queue ptr on gfx9+
+  case Intrinsic::trap:
+  case Intrinsic::debugtrap:
+    IsQueuePtr = true;
+    return "amdgpu-queue-ptr";
+  default:
+    return "";
+  }
+}
+
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool isDSAddress(const Constant *C) {
+  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+  if (!GV)
+    return false;
+  unsigned AS = GV->getAddressSpace();
+  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
+}
+
+class AMDGPUInformationCache : public InformationCache {
+public:
+  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
+                         BumpPtrAllocator &Allocator,
+                         SetVector<Function *> *CGSCC, TargetMachine &TM)
+      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
+  TargetMachine &TM;
+
+  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };
+
+  bool hasApertureRegs(Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.hasApertureRegs();
+  }
+
+private:
+  static bool visitConstExpr(const ConstantExpr *CE) {
+    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+      return castRequiresQueuePtr(SrcAS);
+    }
+    return false;
+  }
+
+  uint8_t getConstantAccess(const Constant *C) {
+    auto It = ConstantStatus.find(C);
+    if (It != ConstantStatus.end())
+      return It->second;
+
+    uint8_t Result = 0;
+    if (isDSAddress(C))
+      Result = DS_GLOBAL;
+
+    if (const auto *CE = dyn_cast<ConstantExpr>(C))
+      if (visitConstExpr(CE))
+        Result |= ADDR_SPACE_CAST;
+
+    for (const Use &U : C->operands()) {
+      const auto *OpC = dyn_cast<Constant>(U);
+      if (!OpC)
+        continue;
+
+      Result |= getConstantAccess(OpC);
+    }
+    return Result;
+  }
+
+public:
+  bool needsQueuePtr(const Constant *C, Function &Fn) {
+    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
+    bool HasAperture = hasApertureRegs(Fn);
+
+    // No need to explore the constants.
+    if (!IsNonEntryFunc && HasAperture)
+      return false;
+
+    uint8_t Access = getConstantAccess(C);
+
+    // We need to trap on DS globals in non-entry functions.
+    if (IsNonEntryFunc && (Access & DS_GLOBAL))
+      return true;
+
+    return !HasAperture && (Access & ADDR_SPACE_CAST);
+  }
+
+private:
+  DenseMap<const Constant *, uint8_t> ConstantStatus;
+};
+
+struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
+                                            Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAAMDAttributes"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDAttributes.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  virtual const DenseSet<StringRef> &getAttributes() const = 0;
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+const char AAAMDAttributes::ID = 0;
+
+struct AAAMDWorkGroupSize
+    : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
+                                               Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAAMDWorkGroupSize"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDAttributes.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  virtual Optional<bool> getUniformWorkGroupSize() const = 0;
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+const char AAAMDWorkGroupSize::ID = 0;
+
+struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
+  AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
+      : AAAMDWorkGroupSize(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    CallingConv::ID CC = F->getCallingConv();
+    if (F->hasFnAttribute("uniform-work-group-size"))
+      UniformWorkGroupSize = F->getFnAttribute("uniform-work-group-size")
+                                 .getValueAsString()
+                                 .equals("true");
+    else if (CC == CallingConv::AMDGPU_KERNEL)
+      UniformWorkGroupSize = false;
+
+    LLVM_DEBUG(dbgs() << "Initial value " << UniformWorkGroupSize << "\n");
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    Optional<bool> NewUniformWorkGroupSize = UniformWorkGroupSize;
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
+                        << "->" << F->getName() << "\n");
+
+      if (!F->hasExactDefinition()) {
+        LLVM_DEBUG(dbgs() << "[AMDWorkGroupSize] Giving up: " << F->getName()
+                          << "\n");
+        NewUniformWorkGroupSize = false;
+        return true;
+      }
+
+      const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+      // Propagate it from Caller to Callee.
+      Optional<bool> CallerWorkGroupSize = CallerInfo.getUniformWorkGroupSize();
+      if (CallerWorkGroupSize.hasValue()) {
+        // Assume false if the callers have different values.
+        if (NewUniformWorkGroupSize.hasValue() &&
+            NewUniformWorkGroupSize != CallerWorkGroupSize) {
+          LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] WorkGroupSize conflict\n");
+          NewUniformWorkGroupSize = false;
+          return false;
+        }
+        NewUniformWorkGroupSize = CallerWorkGroupSize;
+      }
+
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    A.checkForAllCallSites(CheckCallSite, *this, false, AllCallSitesKnown);
+
+    if (NewUniformWorkGroupSize != UniformWorkGroupSize)
+      Change = ChangeStatus::CHANGED;
+    UniformWorkGroupSize = NewUniformWorkGroupSize;
+
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+    if (!UniformWorkGroupSize.hasValue())
+      UniformWorkGroupSize = false;
+
+    AttrList.push_back(
+        Attribute::get(Ctx, "uniform-work-group-size",
+                       UniformWorkGroupSize.getValue() ? "true" : "false"));
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    std::string state = "none";
+    if (UniformWorkGroupSize.hasValue())
+      state = std::to_string(UniformWorkGroupSize.getValue());
+
+    return "AMDWorkGroupSize[" + state + "]";
+  }
+
+  virtual Optional<bool> getUniformWorkGroupSize() const override {
+    return UniformWorkGroupSize;
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+private:
+  Optional<bool> UniformWorkGroupSize = llvm::None;
+};
+
+AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
+                                                          Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
+  llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
+}
+
+struct AAAMDAttributesFunction : public AAAMDAttributes {
+  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
+      : AAAMDAttributes(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    CallingConv::ID CC = F->getCallingConv();
+    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+    // Ignore functions with graphics calling conventions, these are currently
+    // not allowed to have kernel arguments.
+    if (AMDGPU::isGraphics(F->getCallingConv())) {
+      indicatePessimisticFixpoint();
+      return;
+    }
+
+    for (StringRef Attr : ImplicitAttrNames) {
+      if (F->hasFnAttribute(Attr))
+        Attributes.insert(Attr);
+    }
+
+    if (CallingConvSupportsAllImplicits &&
+        F->hasAddressTaken(nullptr, true, true, true)) {
+      for (StringRef AttrName : ImplicitAttrNames) {
+        Attributes.insert(AttrName);
+      }
+    }
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+    CallingConv::ID CC = F->getCallingConv();
+    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+    auto AddAttribute = [&](StringRef AttrName) {
+      if (Attributes.insert(AttrName).second)
+        Change = ChangeStatus::CHANGED;
+    };
+
+    // Check for Intrinsics and propagate attributes.
+    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+        *this, this->getIRPosition(), DepClassTy::REQUIRED);
+
+    // We have to assume that we can reach a function with these attributes.
+    if (CallingConvSupportsAllImplicits && AAEdges.hasUnknownCallee()) {
+      for (StringRef AttrName : ImplicitAttrNames) {
+        AddAttribute(AttrName);
+      }
+    }
+
+    bool NeedsQueuePtr = false;
+    bool HasCall = false;
+    for (Function *Callee : AAEdges.getOptimisticEdges()) {
+      Intrinsic::ID IID = Callee->getIntrinsicID();
+      if (IID != Intrinsic::not_intrinsic) {
+        if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+          AddAttribute("amdgpu-kernarg-segment-ptr");
+          continue;
+        }
+
+        bool NonKernelOnly = false;
+        StringRef AttrName =
+            intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);
+
+        if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
+          AddAttribute(AttrName);
+
+        continue;
+      }
+
+      HasCall = true;
+      const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
+      // Propagate implicit attributes from called function.
+      for (StringRef AttrName : ImplicitAttrNames)
+        if (CalleeAttributes.count(AttrName))
+          AddAttribute(AttrName);
+    }
+
+    HasCall |= AAEdges.hasUnknownCallee();
+    if (!IsNonEntryFunc && HasCall)
+      AddAttribute("amdgpu-calls");
+
+    // Check the function body.
+    auto CheckAlloca = [&](Instruction &I) {
+      AddAttribute("amdgpu-stack-objects");
+      return false;
+    };
+
+    A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca});
+
+    auto CheckAddrSpaceCasts = [&](Instruction &I) {
+      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
+      if (castRequiresQueuePtr(SrcAS)) {
+        NeedsQueuePtr = true;
+        return false;
+      }
+      return true;
+    };
+
+    // If we found that we need amdgpu-queue-ptr, nothing else to do.
+    if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
+      AddAttribute("amdgpu-queue-ptr");
+      return Change;
+    }
+
+    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+
+    // `checkForAllInstructions` is much cheaper than walking every
+    // instruction by hand, so try it first.
+
+    // amdgpu-queue-ptr is not needed if aperture registers are present.
+    if (!HasApertureRegs)
+      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
+                                {Instruction::AddrSpaceCast});
+
+    // If we found that we need amdgpu-queue-ptr, nothing else to do.
+    if (NeedsQueuePtr) {
+      AddAttribute("amdgpu-queue-ptr");
+      return Change;
+    }
+
+    if (!IsNonEntryFunc && HasApertureRegs)
+      return Change;
+
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        for (const Use &U : I.operands()) {
+          if (const auto *C = dyn_cast<Constant>(U)) {
+            if (InfoCache.needsQueuePtr(C, *F)) {
+              AddAttribute("amdgpu-queue-ptr");
+              return Change;
+            }
+          }
+        }
+      }
+    }
+
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+    for (StringRef AttrName : Attributes)
+      AttrList.push_back(Attribute::get(Ctx, AttrName));
+
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
+  }
+
+  const DenseSet<StringRef> &getAttributes() const override {
+    return Attributes;
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+private:
+  DenseSet<StringRef> Attributes;
+};
+
+AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
+                                                    Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
+  llvm_unreachable("AAAMDAttributes is only valid for function position");
+}
+
+class AMDGPUAttributor : public ModulePass {
+public:
+  AMDGPUAttributor() : ModulePass(ID) {}
+
+  /// doInitialization - Virtual method overridden by subclasses to do
+  /// any necessary initialization before any pass is run.
+  bool doInitialization(Module &) override {
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      report_fatal_error("TargetMachine is required");
+
+    TM = &TPC->getTM<TargetMachine>();
+    return false;
+  }
+
+  bool runOnModule(Module &M) override {
+    SetVector<Function *> Functions;
+    AnalysisGetter AG;
+    for (Function &F : M)
+      Functions.insert(&F);
+
+    CallGraphUpdater CGUpdater;
+    BumpPtrAllocator Allocator;
+    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
+    Attributor A(Functions, InfoCache, CGUpdater);
+
+    for (Function &F : M) {
+      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+      A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
+    }
+
+    ChangeStatus Change = A.run();
+    return Change == ChangeStatus::CHANGED;
+  }
+
+  StringRef getPassName() const override { return "AMDGPU Attributor"; }
+  TargetMachine *TM;
+  static char ID;
+};
+
+char AMDGPUAttributor::ID = 0;
+
+Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
+INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -234,6 +234,7 @@
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPUAttributorPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -44,6 +44,7 @@
   AMDGPUAliasAnalysis.cpp
   AMDGPUAlwaysInlinePass.cpp
   AMDGPUAnnotateKernelFeatures.cpp
+  AMDGPUAttributor.cpp
   AMDGPUAnnotateUniformValues.cpp
   AMDGPUArgumentUsageInfo.cpp
   AMDGPUAsmPrinter.cpp
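As a brief, hedged aside (not part of the patch itself): the attributes deduced and manifested above are ordinary string attributes on the IR function, so later AMDGPU code can query them with existing Function/Attribute APIs. The two helper names below are hypothetical and only illustrate the query pattern.

    #include "llvm/IR/Function.h"

    // True if the Attributor concluded the function (possibly through its
    // callees) needs the workitem id Y implicit input.
    static bool usesWorkItemIDY(const llvm::Function &F) {
      return F.hasFnAttribute("amdgpu-work-item-id-y");
    }

    // AAAMDWorkGroupSize::manifest always writes "true" or "false", so a plain
    // string compare is enough once the pass has run.
    static bool hasUniformWorkGroupSize(const llvm::Function &F) {
      return F.getFnAttribute("uniform-work-group-size")
                 .getValueAsString()
                 .equals("true");
    }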
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -197,12 +197,14 @@
 /// attribute list \p Attrs. This is only the case if it was not already present
 /// in \p Attrs at the position describe by \p PK and \p AttrIdx.
 static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
-                             AttributeList &Attrs, int AttrIdx) {
+                             AttributeList &Attrs, int AttrIdx,
+                             bool ForceReplace = false) {
 
   if (Attr.isEnumAttribute()) {
     Attribute::AttrKind Kind = Attr.getKindAsEnum();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
     return true;
@@ -210,7 +212,8 @@
   if (Attr.isStringAttribute()) {
     StringRef Kind = Attr.getKindAsString();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
     return true;
@@ -218,7 +221,8 @@
   if (Attr.isIntAttribute()) {
     Attribute::AttrKind Kind = Attr.getKindAsEnum();
     if (Attrs.hasAttribute(AttrIdx, Kind))
-      if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+      if (!ForceReplace &&
+          isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
         return false;
     Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
     Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
@@ -299,7 +303,8 @@
 
 ChangeStatus
 IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
-                                   const ArrayRef<Attribute> &DeducedAttrs) {
+                                   const ArrayRef<Attribute> &DeducedAttrs,
+                                   bool ForceReplace) {
   Function *ScopeFn = IRP.getAnchorScope();
   IRPosition::Kind PK = IRP.getPositionKind();
 
@@ -327,7 +332,7 @@
   ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
   LLVMContext &Ctx = IRP.getAnchorValue().getContext();
   for (const Attribute &Attr : DeducedAttrs) {
-    if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
+    if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx(), ForceReplace))
       continue;
 
     HasChanged = ChangeStatus::CHANGED;
@@ -760,6 +765,22 @@
   return false;
 }
 
+bool Attributor::isAssumedDead(const BasicBlock &BB,
+                               const AbstractAttribute *QueryingAA,
+                               const AAIsDead *FnLivenessAA,
+                               DepClassTy DepClass) {
+  if (!FnLivenessAA)
+    FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*BB.getParent()),
+                                         QueryingAA, DepClassTy::NONE);
+  if (FnLivenessAA->isAssumedDead(&BB)) {
+    if (QueryingAA)
+      recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
+    return true;
+  }
+
+  return false;
+}
+
 bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
                                  const AbstractAttribute &QueryingAA,
                                  const Value &V, DepClassTy LivenessDepClass) {
@@ -2032,6 +2053,8 @@
     // The alignment of a pointer is interesting for loads.
   case Instruction::Store:
     // The alignment of a pointer is interesting for stores.
+ case Instruction::Alloca: + case Instruction::AddrSpaceCast: IsInterestingOpcode = true; } if (IsInterestingOpcode) { diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -1,4 +1,5 @@ -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrspace(4)* nocapture, i32, i1) #0 @@ -98,9 +99,15 @@ ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) } -; HSA: attributes #0 = { argmemonly nofree nounwind willreturn } -; HSA: attributes #1 = { nounwind } -; HSA: attributes #2 = { nounwind "amdgpu-queue-ptr" } +; Attributor assumes for kernels uniform-work-group-size false if it is not present. + +; ATTRIBUTOR_HSA: attributes #0 = { argmemonly nofree nounwind willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #1 = { nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #2 = { nounwind "amdgpu-queue-ptr" "uniform-work-group-size"="false" } + +; AKF_HSA: attributes #0 = { argmemonly nofree nounwind willreturn } +; AKF_HSA: attributes #1 = { nounwind } +; AKF_HSA: attributes #2 = { nounwind "amdgpu-queue-ptr" } attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1,4 +1,6 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s + declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 @@ -170,7 +172,9 @@ ret void } -; HSA: define void @recursive_use_workitem_id_y() #2 { +; Attributor sets uniform-work-group-size +; ATTRIBUTOR_HSA: define void @recursive_use_workitem_id_y() #12 { +; AKF_HSA: define void @recursive_use_workitem_id_y() #2 { define void @recursive_use_workitem_id_y() #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() store volatile i32 %val, i32 addrspace(1)* undef @@ -191,14 +195,15 @@ ret void } -; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { + +; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #[[SHIFTED1:[0-9]+]] { define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* 
%ptr) #13 { +; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #[[SHIFTED2:[0-9]+]] { define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof @@ -212,7 +217,8 @@ ret void } -; HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #11 { +; AKF_HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #11 { +; ATTRIBUTOR_HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #15 { define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null) ret void @@ -224,85 +230,96 @@ ret void } -; HSA: define void @use_kernarg_segment_ptr() #14 { +; HSA: define void @use_kernarg_segment_ptr() #[[SHIFTED3:[0-9]+]] { define void @use_kernarg_segment_ptr() #1 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() store volatile i8 addrspace(4)* %kernarg.segment.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } -; HSA: define void @func_indirect_use_kernarg_segment_ptr() #11 { +; AKF_HSA: define void @func_indirect_use_kernarg_segment_ptr() #11 { +; ATTRIBUTOR_HSA: define void @func_indirect_use_kernarg_segment_ptr() #15 { define void @func_indirect_use_kernarg_segment_ptr() #1 { call void @use_kernarg_segment_ptr() ret void } -; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 { + +; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #[[SHIFTED4:[0-9]+]] { define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } -; HSA: define void @use_implicitarg_ptr() #16 { + +; AKF_HSA: define void @use_implicitarg_ptr() #16 { +; ATTRIBUTOR_HSA: define void @use_implicitarg_ptr() #17 { define void @use_implicitarg_ptr() #1 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } -; HSA: define void @func_indirect_use_implicitarg_ptr() #16 { + +; Attributor sets uniform-workgroup-size attribute. +; AKF_HSA: define void @func_indirect_use_implicitarg_ptr() #16 { +; ATTRIBUTOR_HSA: define void @func_indirect_use_implicitarg_ptr() #17 { define void @func_indirect_use_implicitarg_ptr() #1 { call void @use_implicitarg_ptr() ret void } -; HSA: declare void @external.func() #17 +; AKF_HSA: declare void @external.func() #[[SHIFTED5:[0-9]+]] declare void @external.func() #3 -; HSA: define internal void @defined.func() #17 { +; This function gets deleted. 
+; AKF_HSA: define internal void @defined.func() #17 { define internal void @defined.func() #3 { ret void } -; HSA: define void @func_call_external() #17 { +; HSA: define void @func_call_external() #[[SHIFTED5:[0-9]+]] { define void @func_call_external() #3 { call void @external.func() ret void } -; HSA: define void @func_call_defined() #17 { +; HSA: define void @func_call_defined() #[[SHIFTED5]] { define void @func_call_defined() #3 { call void @defined.func() ret void } -; HSA: define void @func_call_asm() #18 { +;FIXME: Investigate +; AKF_HSA: define void @func_call_asm() #18 { +; ATTRIBUTOR_HSA: define void @func_call_asm() #19 { define void @func_call_asm() #3 { call void asm sideeffect "", ""() #3 ret void } -; HSA: define amdgpu_kernel void @kern_call_external() #19 { +; HSA: define amdgpu_kernel void @kern_call_external() #[[SHIFTED6:[0-9]+]] { define amdgpu_kernel void @kern_call_external() #3 { call void @external.func() ret void } -; HSA: define amdgpu_kernel void @func_kern_defined() #19 { +; AKF_HSA: define amdgpu_kernel void @func_kern_defined() #19 { +; ATTRIBUTOR_HSA: define amdgpu_kernel void @func_kern_defined() #18 { define amdgpu_kernel void @func_kern_defined() #3 { call void @defined.func() ret void } -; HSA: define i32 @use_dispatch_ptr_ret_type() #20 { +; HSA: define i32 @use_dispatch_ptr_ret_type() #[[SHIFTED7:[0-9]*]] { define i32 @use_dispatch_ptr_ret_type() #1 { %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() store volatile i8 addrspace(4)* %dispatch.ptr, i8 addrspace(4)* addrspace(1)* undef ret i32 0 } -; HSA: define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #20 { +; HSA: define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #[[SHIFTED7]] { define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { %f = call float bitcast (i32()* @use_dispatch_ptr_ret_type to float()*)() %fadd = fadd float %f, 1.0 @@ -314,7 +331,9 @@ attributes #2 = { nounwind "target-cpu"="gfx900" } attributes #3 = { nounwind } -; HSA: attributes #0 = { nounwind readnone speculatable willreturn } +; AKF_HSA: attributes #0 = { nounwind readnone speculatable willreturn } +; ATTRIBUTOR_HSA: attributes #0 = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } + ; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } @@ -324,14 +343,30 @@ ; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; HSA: attributes #9 = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } -; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; HSA: attributes #15 = { nounwind 
"amdgpu-implicitarg-ptr" "target-cpu"="fiji" } -; HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" } -; HSA: attributes #18 = { nounwind } -; HSA: attributes #19 = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } -; HSA: attributes #20 = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } + +; AKF_HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; ATTRIBUTOR_HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } + +; AKF_HSA: attributes #11 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #11 = { noreturn nounwind readnone "target-cpu"="fiji" "uniform-work-group-size"="false" } +; Attribute numbers shifted from now on, because of this attribute. +; ATTRIBUTOR_HSA: attributes #12 = { noreturn nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } + +; HSA: attributes #[[SHIFTED1]] = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; HSA: attributes #[[SHIFTED2]] = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } + +; Attributes shift more +; ATTRIBUTOR_HSA: attributes #15 = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } +; HSA: attributes #[[SHIFTED3]] = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; AKF_HSA: attributes #[[SHIFTED4]] = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" } +; ATTRIBUTOR_HSA: attributes #[[SHIFTED4]] = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } + +; This is the same as Attributor #17 +; AKF_HSA: attributes #16 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } + +; HSA: attributes #[[SHIFTED5]] = { nounwind "uniform-work-group-size"="false" } +; HSA: attributes #[[SHIFTED6]] = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } + +; AKF_HSA: attributes #[[SHIFTED7]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } +; ATTRIBUTOR_AKF: attributes #[[SHIFTED7]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } + diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -1,4 +1,5 @@ -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck 
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -280,17 +281,34 @@
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
 
-; HSA: attributes #0 = { nounwind readnone speculatable willreturn }
-; HSA: attributes #1 = { nounwind }
-; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" }
-; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" }
-; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" }
-; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" }
-; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" }
-; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" }
-; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
-; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
-; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
-; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
-; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
-; HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }
+; AKF_HSA: attributes #0 = { nounwind readnone speculatable willreturn }
+; AKF_HSA: attributes #1 = { nounwind }
+; AKF_HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" }
+; AKF_HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" }
+; AKF_HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" }
+; AKF_HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" }
+; AKF_HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" }
+; AKF_HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" }
+; AKF_HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; AKF_HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; AKF_HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
+; AKF_HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
+; AKF_HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
+; AKF_HSA: attributes #13 = { nounwind "amdgpu-stack-objects" }
+
+
+; ATTRIBUTOR_HSA: attributes #0 = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #1 = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #9 = { nounwind
"amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #13 = { nounwind "amdgpu-stack-objects" "uniform-work-group-size"="false" } + diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: define internal void @indirect() #0 { define internal void @indirect() { diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: define internal void @indirect() #0 { define internal void @indirect() { diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Check that no attributes are added to graphics functions -; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s ; Check that it doesn't crash ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s @@ -9,8 +10,11 @@ target datalayout = "A5" + define amdgpu_cs void @test_simple_indirect_call() { -; GCN-LABEL: define amdgpu_cs void @test_simple_indirect_call() { +; AKF_GCN-LABEL: define amdgpu_cs void @test_simple_indirect_call() { +; Attributor adds work-group-size attribute. This should be ok. 
+; ATTRIBUTOR_GCN-LABEL: define amdgpu_cs void @test_simple_indirect_call() #0 { ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_getpc_b64 s[36:37] @@ -53,3 +57,7 @@ declare i64 @llvm.amdgcn.s.getpc() #0 attributes #0 = { nounwind readnone speculatable willreturn } + +; ATTRIBUTOR_GCN: attributes #0 = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #1 = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } + diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s @@ -47,5 +48,6 @@ ret void } +;FIXME: The AMDGPU Attributor does not deduce the uniform-group-size attribute. ; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -1,9 +1,14 @@ ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s ; If the kernel does not have the uniform-work-group-attribute, set both callee and caller as false -; CHECK: define void @foo() #[[FOO:[0-9]+]] { +; sink function is added to prevent attributor from deleting the functions. +declare void @sink() + +; CHECK: define void @foo() #[[FOO:[0-9]+]] define void @foo() #0 { + call void @sink() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -1,9 +1,15 @@ + ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s ; Test to verify if the attribute gets propagated across nested function calls -; CHECK: define void @func1() #[[FUNC:[0-9]+]] { +; Added to prevent Attributor from deleting calls. 
+declare void @sink() + +; CHECK: define void @func1() #[[FUNC:[0-9]+]] define void @func1() #0 { + call void @sink() ret void } @@ -20,6 +26,5 @@ } attributes #2 = { "uniform-work-group-size"="true" } - ; CHECK: attributes #[[FUNC]] = { "uniform-work-group-size"="true" } ; CHECK: attributes #[[KERNEL]] = { "amdgpu-calls" "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -1,9 +1,14 @@ ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s + +; Function added to prevent attributor from deleting call sites. +declare void @sink() ; Two kernels with different values of the uniform-work-group-attribute call the same function ; CHECK: define void @func() #[[FUNC:[0-9]+]] { define void @func() #0 { + call void @sink() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -1,8 +1,13 @@ ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s + +; function added to prevent attributor from deleting calls. +declare void @sink() ; Propagate the uniform-work-group-attribute from the kernel to callee if it doesn't have it ; CHECK: define void @func() #[[FUNC:[0-9]+]] { define void @func() #0 { + call void @sink() ret void } @@ -15,6 +20,7 @@ ; External declaration of a function ; CHECK: define weak_odr void @weak_func() #[[FUNC]] { define weak_odr void @weak_func() #0 { + call void @sink() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -1,4 +1,6 @@ ; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s + ; Test to ensure recursive functions exhibit proper behaviour ; Test to generate fibonacci numbers @@ -32,7 +34,9 @@ ret void } +; nounwind and readnone are added to match attributor results. 
+attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[FIB]] = { "uniform-work-group-size"="true" } +; CHECK: attributes #[[FIB]] = { nounwind readnone "uniform-work-group-size"="true" } ; CHECK: attributes #[[KERNEL]] = { "amdgpu-calls" "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -1,29 +1,27 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s -; CHECK: define void @func1() #[[FUNC:[0-9]+]] { -define void @func1() { - ret void -} -; CHECK: define void @func4() #[[FUNC]] { -define void @func4() { - ret void -} +; CHECK: declare void @func1() #[[FUNC0:[0-9]+]] +declare void @func1() + +; CHECK: declare void @func4() #[[FUNC0]] +declare void @func4() -; CHECK: define void @func2() #[[FUNC]] { +; CHECK: define void @func2() #[[FUNC0]] { define void @func2() #0 { call void @func4() call void @func1() ret void } -; CHECK: define void @func3() #[[FUNC]] { +; CHECK: define void @func3() #[[FUNC0]] { define void @func3() { call void @func1() ret void } -; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC:[0-9]+]] { +; CHECK: define amdgpu_kernel void @kernel3() #[[FUNC1:[0-9]+]] { define amdgpu_kernel void @kernel3() #0 { call void @func2() call void @func3() @@ -32,4 +30,5 @@ attributes #0 = { "uniform-work-group-size"="false" } -; CHECK: attributes #[[FUNC]] = { "amdgpu-calls" "uniform-work-group-size"="false" } +; CHECK: attributes #[[FUNC0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[FUNC1]] = { "amdgpu-calls" "uniform-work-group-size"="false" }
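A minimal, hedged sketch (not part of the patch) of the overwrite semantics the new ForceReplace flag provides: without it, addIfNotExistent keeps an already-present "uniform-work-group-size" value, while AAAMDWorkGroupSize must be able to turn a pre-existing "true" into a deduced "false". The helper name setUniformWorkGroupSize is hypothetical; removeFnAttr and addFnAttr are existing Function APIs, used here only to illustrate the intended effect.

    #include "llvm/IR/Function.h"

    static void setUniformWorkGroupSize(llvm::Function &F, bool Uniform) {
      // Drop any existing value first, then attach the newly deduced one, so a
      // stale "true" cannot survive a deduction of "false".  manifestAttrs with
      // ForceReplace = true achieves the same result through the Attributor's
      // attribute-list handling.
      F.removeFnAttr("uniform-work-group-size");
      F.addFnAttr("uniform-work-group-size", Uniform ? "true" : "false");
    }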