Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -49,6 +49,10 @@
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
 
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
+
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,125 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+  void addAttrToCallers(Function *Intrin, StringRef AttrName);
+  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+  static char ID;
+
+  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  const char *getPassName() const override {
+    return "AMDGPU Annotate Kernel Features";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                      "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                    "Add AMDGPU function attributes", false, false)
+
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+                                                    StringRef AttrName) {
+  SmallPtrSet<Function *, 4> SeenFuncs;
+
+  for (User *U : Intrin->users()) {
+    // CallInst is the only valid user for an intrinsic.
+    CallInst *CI = cast<CallInst>(U);
+
+    Function *CallingFunction = CI->getParent()->getParent();
+    if (SeenFuncs.insert(CallingFunction).second)
+      CallingFunction->addFnAttr(AttrName);
+  }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+  Module &M,
+  ArrayRef<StringRef[2]> IntrinsicToAttr) {
+  bool Changed = false;
+
+  for (const StringRef *Arr : IntrinsicToAttr) {
+    if (Function *Fn = M.getFunction(Arr[0])) {
+      addAttrToCallers(Fn, Arr[1]);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+  Triple TT(M.getTargetTriple());
+
+  static const StringRef IntrinsicToAttr[][2] = {
+    // .x omitted
+    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+    // .x omitted
+    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+
+  };
+
+  static const StringRef HSAIntrinsicToAttr[][2] = {
+    { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
+
+    { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }
+  };
+
+  // TODO: Intrinsics that require queue ptr.
+
+  // We do not need to note the x workitem or workgroup id because they are
+  // always initialized.
+
+  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+  if (TT.getOS() == Triple::AMDHSA)
+    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+  return new AMDGPUAnnotateKernelFeatures();
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -49,6 +49,7 @@
   initializeSIFixSGPRLiveRangesPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -194,8 +195,10 @@
   // functions, then we will generate code for the first function
   // without ever running any passes on the second.
   addPass(createBarrierNoopPass());
+
   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+
   TargetPassConfig::addIRPasses();
 }
@@ -267,6 +270,11 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
+
+  // FIXME: We need to run a pass to propagate the attributes when calls are
+  // supported.
+  addPass(&AMDGPUAnnotateKernelFeaturesID);
+
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
   addPass(createSIAnnotateControlFlowPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -15,6 +15,7 @@
 add_llvm_target(AMDGPUCodeGen
   AMDILCFGStructurizer.cpp
   AMDGPUAlwaysInlinePass.cpp
+  AMDGPUAnnotateKernelFeatures.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUDiagnosticInfoUnsupported.cpp
   AMDGPUFrameLowering.cpp
Index: test/CodeGen/AMDGPU/annotate-kernel-features.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -0,0 +1,193 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s
+; RUN: opt -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
+
+declare i32 @llvm.r600.read.tgid.x() #0
+declare i32 @llvm.r600.read.tgid.y() #0
+declare i32 @llvm.r600.read.tgid.z() #0
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.r600.read.tidig.z() #0
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+declare i32 @llvm.r600.read.global.size.x() #0
+declare i32 @llvm.r600.read.global.size.y() #0
+declare i32 @llvm.r600.read.global.size.z() #0
+
+
+; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tgid.x()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tgid.y()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tgid.y()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  %val1 = call i32 @llvm.r600.read.tgid.y()
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tgid.x()
+  %val1 = call i32 @llvm.r600.read.tgid.y()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tgid.z()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tgid.x()
+  %val1 = call i32 @llvm.r600.read.tgid.z()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tgid.y()
+  %val1 = call i32 @llvm.r600.read.tgid.z()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tgid.x()
+  %val1 = call i32 @llvm.r600.read.tgid.y()
+  %val2 = call i32 @llvm.r600.read.tgid.z()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  store volatile i32 %val2, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tidig.x()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tidig.y()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.tidig.z()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tidig.x()
+  %val1 = call i32 @llvm.r600.read.tgid.x()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tidig.y()
+  %val1 = call i32 @llvm.r600.read.tgid.y()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tidig.x()
+  %val1 = call i32 @llvm.r600.read.tidig.y()
+  %val2 = call i32 @llvm.r600.read.tidig.z()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  store volatile i32 %val2, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+  %val0 = call i32 @llvm.r600.read.tidig.x()
+  %val1 = call i32 @llvm.r600.read.tidig.y()
+  %val2 = call i32 @llvm.r600.read.tidig.z()
+  %val3 = call i32 @llvm.r600.read.tgid.x()
+  %val4 = call i32 @llvm.r600.read.tgid.y()
+  %val5 = call i32 @llvm.r600.read.tgid.z()
+  store volatile i32 %val0, i32 addrspace(1)* %ptr
+  store volatile i32 %val1, i32 addrspace(1)* %ptr
+  store volatile i32 %val2, i32 addrspace(1)* %ptr
+  store volatile i32 %val3, i32 addrspace(1)* %ptr
+  store volatile i32 %val4, i32 addrspace(1)* %ptr
+  store volatile i32 %val5, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.local.size.x()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.local.size.y()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+  %val = call i32 @llvm.r600.read.local.size.z()
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; HSA: attributes #0 = { nounwind readnone }
+; HSA: attributes #1 = { nounwind }
+; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" }
+; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" }
+; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" }
+; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" }
+; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" }
+; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" }
+; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
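
For reference, a minimal sketch (not part of the patch) of how later lowering code might query the string attributes this pass sets. The helper names below are hypothetical; the real consumers are expected to be the SI calling-convention/argument lowering once it reads these attributes.

// Illustrative only: querying the function attributes added by
// AMDGPUAnnotateKernelFeatures. Helper names are hypothetical.
#include "llvm/IR/Function.h"

using namespace llvm;

// True if the kernel used llvm.r600.read.tgid.y; the .x ids need no
// attribute because they are always initialized.
static bool kernelNeedsWorkGroupIDY(const Function &F) {
  return F.hasFnAttribute("amdgpu-work-group-id-y");
}

// True if the kernel needs the dispatch pointer; set for the local/global
// size intrinsics when targeting amdhsa.
static bool kernelNeedsDispatchPtr(const Function &F) {
  return F.hasFnAttribute("amdgpu-dispatch-ptr");
}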