Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -58,6 +58,8 @@ FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); +FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); +ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); FunctionPass *createSIModeRegisterPass(); @@ -92,6 +94,12 @@ void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); extern char &AMDGPULowerKernelAttributesID; +void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &); +extern char &AMDGPUPropagateAttributesEarlyID; + +void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &); +extern char &AMDGPUPropagateAttributesLateID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; Index: lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -0,0 +1,336 @@ +//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass propagates attributes from kernels to the non-entry +/// functions. Most of the library functions were not compiled for specific ABI, +/// yet will be correctly compiled if proper attributes are propagated from the +/// caller. 
+/// +/// The pass analyzes call graph and propagates ABI target features through the +/// call graph. +/// +/// It can run in two modes: as a function or module pass. A function pass +/// simply propagates attributes. A module pass clones functions if there are +/// callers with different ABI. If a function is cloned, all call sites will +/// be updated to use a correct clone. +/// +/// A function pass is limited in functionality but can run early in the +/// pipeline. A module pass is more powerful but has to run late, so misses +/// library folding opportunities. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "amdgpu-propagate-attributes" + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include + +using namespace llvm; + +namespace llvm { +extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; +} + +namespace { + +class AMDGPUPropagateAttributes { + const FeatureBitset TargetFeatures = { + AMDGPU::FeatureWavefrontSize16, + AMDGPU::FeatureWavefrontSize32, + AMDGPU::FeatureWavefrontSize64 + }; + + class Clone{ + public: + Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) : + FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {} + + FeatureBitset FeatureMask; + Function *OrigF; + Function *NewF; + }; + + const TargetMachine *TM; + + // Clone functions as needed or just set attributes. + bool AllowClone; + + // Option propagation roots. + SmallSet Roots; + + // Clones of functions with their attributes. + SmallVector Clones; + + // Find a clone with required features. 
+ Function *findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF); + + // Clone function F and set NewFeatures on the clone. + // Clone takes the name of original function. + Function *cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures); + + // Set new function's features in place. + void setFeatures(Function &F, const FeatureBitset &NewFeatures); + + std::string getFeatureString(const FeatureBitset &Features) const; + + // Propagate attributes from Roots. + bool process(); + +public: + AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : + TM(TM), AllowClone(AllowClone) {} + + // Use F as a root and propagate its attributes. + bool process(Function &F); + + // Propagate attributes starting from kernel functions. + bool process(Module &M); +}; + +// Allows to propagate attributes early, but no cloning is allowed as it must +// be a function pass to run before any optimizations. +// TODO: We shall only need one instance of module pass, but that needs to be +// in the linker pipeline which is currently not possible. +class AMDGPUPropagateAttributesEarly : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : + FunctionPass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesEarlyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +// Allows to propagate attributes with cloning but does that late in the +// pipeline. +class AMDGPUPropagateAttributesLate : public ModulePass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : + ModulePass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesLatePass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace. 
+ +char AMDGPUPropagateAttributesEarly::ID = 0; +char AMDGPUPropagateAttributesLate::ID = 0; + +INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, + "amdgpu-propagate-attributes-early", + "Early propagate attributes from kernels to functions", + false, false) +INITIALIZE_PASS(AMDGPUPropagateAttributesLate, + "amdgpu-propagate-attributes-late", + "Late propagate attributes from kernels to functions", + false, false) + +Function * +AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF) { + // TODO: search for clone's clones. + for (Clone &C : Clones) + if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask) + return C.NewF; + + return nullptr; +} + +bool AMDGPUPropagateAttributes::process(Module &M) { + for (auto &F : M.functions()) + if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) + Roots.insert(&F); + + return process(); +} + +bool AMDGPUPropagateAttributes::process(Function &F) { + Roots.insert(&F); + return process(); +} + +bool AMDGPUPropagateAttributes::process() { + bool Changed = false; + SmallSet NewRoots; + SmallSet Replaced; + + if (Roots.empty()) + return false; + Module &M = *(*Roots.begin())->getParent(); + + do { + Roots.insert(NewRoots.begin(), NewRoots.end()); + NewRoots.clear(); + + for (auto &F : M.functions()) { + if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F)) + continue; + + const FeatureBitset &CalleeBits = + TM->getSubtargetImpl(F)->getFeatureBits(); + SmallVector, 32> ToReplace; + + for (User *U : F.users()) { + Instruction *I = dyn_cast(U); + if (!I) + continue; + CallBase *CI = dyn_cast(I); + if (!CI) + continue; + Function *Caller = CI->getCaller(); + if (!Caller) + continue; + if (!Roots.count(Caller)) + continue; + + const FeatureBitset &CallerBits = + TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures; + + if (CallerBits == (CalleeBits & TargetFeatures)) { + NewRoots.insert(&F); + continue; + } + + Function *NewF = findFunction(CallerBits, &F); + if (!NewF) { + 
FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) | + CallerBits); + if (!AllowClone) { + // This may set different features on different iterations if + // there is a contradiction in callers' attributes. In this case + // we rely on a second pass running on Module, which is allowed + // to clone. + setFeatures(F, NewFeatures); + NewRoots.insert(&F); + Changed = true; + break; + } + + NewF = cloneWithFeatures(F, NewFeatures); + Clones.push_back(Clone(CallerBits, &F, NewF)); + NewRoots.insert(NewF); + } + + ToReplace.push_back(std::make_pair(CI, NewF)); + Replaced.insert(&F); + + Changed = true; + } + + while (!ToReplace.empty()) { + auto R = ToReplace.pop_back_val(); + R.first->setCalledFunction(R.second); + } + } + } while (!NewRoots.empty()); + + for (Function *F : Replaced) { + if (F->use_empty()) + F->eraseFromParent(); + } + + return Changed; +} + +Function * +AMDGPUPropagateAttributes::cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures) { + LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); + + ValueToValueMapTy dummy; + Function *NewF = CloneFunction(&F, dummy); + setFeatures(*NewF, NewFeatures); + + // Swap names. If that is the only clone it will retain the name of now + // dead value. + if (F.hasName()) { + std::string NewName = NewF->getName(); + NewF->takeName(&F); + F.setName(NewName); + + // Name has changed, it does not need an external symbol. 
+ F.setVisibility(GlobalValue::DefaultVisibility); + F.setLinkage(GlobalValue::InternalLinkage); + } + + return NewF; +} + +void AMDGPUPropagateAttributes::setFeatures(Function &F, + const FeatureBitset &NewFeatures) { + std::string NewFeatureStr = getFeatureString(NewFeatures); + + LLVM_DEBUG(dbgs() << "Set features " + << getFeatureString(NewFeatures & TargetFeatures) + << " on " << F.getName() << '\n'); + + F.removeFnAttr("target-features"); + F.addFnAttr("target-features", NewFeatureStr); +} + +std::string +AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const +{ + std::string Ret; + for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { + if (Features[KV.Value]) + Ret += (StringRef("+") + KV.Key + ",").str(); + else if (TargetFeatures[KV.Value]) + Ret += (StringRef("-") + KV.Key + ",").str(); + } + Ret.pop_back(); // Remove last comma. + return Ret; +} + +bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { + if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + return AMDGPUPropagateAttributes(TM, false).process(F); +} + +bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { + if (!TM) + return false; + + return AMDGPUPropagateAttributes(TM, true).process(M); +} + +FunctionPass +*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesEarly(TM); +} + +ModulePass +*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesLate(TM); +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -217,6 +217,8 @@ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPUPropagateAttributesEarlyPass(*PR); + 
initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); @@ -402,13 +404,14 @@ Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, - [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); PM.add(createGlobalDCEPass()); @@ -426,6 +429,7 @@ PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } + PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); @@ -654,6 +658,9 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + // A call to propagate attributes pass in the backend in case opt was not run. 
+ addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); + addPass(createAtomicExpandPass()); // This must occur before inlining, as the inliner will not look through Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -58,6 +58,7 @@ AMDGPUMCInstLower.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUPromoteAlloca.cpp + AMDGPUPropagateAttributes.cpp AMDGPURegAsmNames.inc.cpp AMDGPURegisterBankInfo.cpp AMDGPURegisterInfo.cpp Index: test/CodeGen/AMDGPU/propagate-attributes-clone.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/propagate-attributes-clone.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s + +; OPT: declare void @foo4() local_unnamed_addr #0 +; OPT: define internal fastcc void @foo3.2() unnamed_addr #1 +; OPT: define void @foo2() local_unnamed_addr #1 +; OPT: define internal fastcc void @foo1.1() unnamed_addr #1 +; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2 +; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #3 +; OPT: define amdgpu_kernel void @kernel3() local_unnamed_addr #3 +; OPT: define void @foo1() local_unnamed_addr #4 +; OPT: define void @foo3() local_unnamed_addr #4 +; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" } +; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,-wavefrontsize32,+wavefrontsize64{{.*}}" } +; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32" } +; OPT: attributes #3 = { {{.*}} "target-features"="+wavefrontsize64" } +; OPT: attributes #4 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64{{.*}}" } + +; LLC: foo3: +; LLC: sample asm +; LLC: foo2: +; LLC: sample 
asm +; LLC: foo1: +; LLC: foo4@gotpcrel32@lo+4 +; LLC: foo4@gotpcrel32@hi+4 +; LLC: foo3@gotpcrel32@lo+4 +; LLC: foo3@gotpcrel32@hi+4 +; LLC: foo2@gotpcrel32@lo+4 +; LLC: foo2@gotpcrel32@hi+4 +; LLC: foo1@gotpcrel32@lo+4 +; LLC: foo1@gotpcrel32@hi+4 +; LLC: kernel1: +; LLC: foo1@gotpcrel32@lo+4 +; LLC: foo1@gotpcrel32@hi+4 +; LLC: kernel2: +; LLC: foo2@gotpcrel32@lo+4 +; LLC: foo2@gotpcrel32@hi+4 +; LLC: kernel3: +; LLC: foo1@gotpcrel32@lo+4 +; LLC: foo1@gotpcrel32@hi+4 + +declare void @foo4() #1 + +define void @foo3() #1 { +entry: + call void asm sideeffect "; sample asm", ""() + ret void +} + +define void @foo2() #1 { +entry: + call void asm sideeffect "; sample asm", ""() + ret void +} + +define void @foo1() #1 { +entry: + tail call void @foo4() + tail call void @foo3() + tail call void @foo2() + tail call void @foo2() + tail call void @foo1() + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +entry: + tail call void @foo1() + ret void +} + +define amdgpu_kernel void @kernel2() #2 { +entry: + tail call void @foo2() + ret void +} + +define amdgpu_kernel void @kernel3() #3 { +entry: + tail call void @foo1() + ret void +} + +attributes #0 = { nounwind "target-features"="+wavefrontsize32" } +attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" } +attributes #2 = { nounwind "target-features"="+wavefrontsize64" } +attributes #3 = { nounwind "target-features"="+wavefrontsize64" } Index: test/CodeGen/AMDGPU/propagate-attributes-single-set.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/propagate-attributes-single-set.ll @@ -0,0 +1,72 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s + +; OPT: declare void @foo4() local_unnamed_addr #0 +; OPT: define void @foo3() local_unnamed_addr #1 +; OPT: define void @foo2() local_unnamed_addr #1 +; OPT: define 
void @foo1() local_unnamed_addr #1 +; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2 +; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #2 +; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" } +; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64 +; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32 +; OPT: attributes #3 = { nounwind } + +; LLC: foo3: +; LLC: sample asm +; LLC: foo2: +; LLC: sample asm +; LLC: foo1: +; LLC: foo4@gotpcrel32@lo+4 +; LLC: foo4@gotpcrel32@hi+4 +; LLC: foo3@gotpcrel32@lo+4 +; LLC: foo3@gotpcrel32@hi+4 +; LLC: foo2@gotpcrel32@lo+4 +; LLC: foo2@gotpcrel32@hi+4 +; LLC: foo1@gotpcrel32@lo+4 +; LLC: foo1@gotpcrel32@hi+4 +; LLC: kernel1: +; LLC: foo1@gotpcrel32@lo+4 +; LLC: foo1@gotpcrel32@hi+4 +; LLC: kernel2: +; LLC: foo2@gotpcrel32@lo+4 +; LLC: foo2@gotpcrel32@hi+4 + +declare void @foo4() #1 + +define void @foo3() #1 { +entry: + call void asm sideeffect "; sample asm", ""() + ret void +} + +define void @foo2() #1 { +entry: + call void asm sideeffect "; sample asm", ""() + ret void +} + +define void @foo1() #1 { +entry: + tail call void @foo4() + tail call void @foo3() + tail call void @foo2() + tail call void @foo2() + tail call void @foo1() + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +entry: + tail call void @foo1() + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +entry: + tail call void @foo2() + ret void +} + +attributes #0 = { nounwind "target-features"="+wavefrontsize32" } +attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" }