diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -27,6 +27,7 @@
 #include "clang/AST/OSLog.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/ADT/APFloat.h"
@@ -17098,24 +17099,61 @@
 }
 
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+/// Emit code based on Code Object ABI version.
+/// COV_4    : Emit code to use dispatch ptr
+/// COV_5    : Emit code to use implicitarg ptr
+/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version"
+///            and use its value for COV_4 or COV_5 approach. It is used for
+///            compiling device libraries in an ABI-agnostic way.
+///
+/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and initialized
+///       by clang during compilation of user code.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
-                 clang::TargetOptions::COV_5;
-  Constant *Offset;
-  Value *DP;
-  if (IsCOV_5) {
+  llvm::LoadInst *LD;
+
+  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
+
+  if (Cov == clang::TargetOptions::COV_None) {
+    auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
+        "llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
+        CodeGen::NotForDefinition);
+
+    // This load will be eliminated by the IPSCCP because it is constant
+    // weak_odr without externally_initialized. Either changing it to weak or
+    // adding externally_initialized will keep the load.
+    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
+                                                      CGF.CGM.getIntAlign());
+
+    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
+        ABIVersion,
+        llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5));
+
     // Indexing the implicit kernarg segment.
-    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
-    DP = EmitAMDGPUImplicitArgPtr(CGF);
-  } else {
+    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
+        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
+
     // Indexing the HSA kernel_dispatch_packet struct.
-    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
-    DP = EmitAMDGPUDispatchPtr(CGF);
+    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
+        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
+
+    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
+    LD = CGF.Builder.CreateLoad(
+        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+  } else {
+    Value *GEP = nullptr;
+    if (Cov == clang::TargetOptions::COV_5) {
+      // Indexing the implicit kernarg segment.
+      GEP = CGF.Builder.CreateConstGEP1_32(
+          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
+    } else {
+      // Indexing the HSA kernel_dispatch_packet struct.
+      GEP = CGF.Builder.CreateConstGEP1_32(
+          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
+    }
+    LD = CGF.Builder.CreateLoad(
+        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
   }
-  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
-  auto *LD = CGF.Builder.CreateLoad(
-      Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
+
   llvm::MDBuilder MDHelper(CGF.getLLVMContext());
   llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
     APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1571,6 +1571,11 @@
   void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
                                   const AMDGPUWavesPerEUAttr *A);
 
+  llvm::Constant *
+  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
+                        LangAS AddrSpace, const VarDecl *D,
+                        ForDefinition_t IsForDefinition = NotForDefinition);
+
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
       StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@@ -1593,11 +1598,6 @@
   void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
                                StringRef &CurName);
 
-  llvm::Constant *
-  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
-                        LangAS AddrSpace, const VarDecl *D,
-                        ForDefinition_t IsForDefinition = NotForDefinition);
-
   bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
                                    llvm::AttrBuilder &AttrBuilder,
                                    bool SetTargetFeatures = true);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1203,6 +1203,8 @@
     getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
                               getContext().getTargetInfo().getMaxTLSAlign());
 
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
+
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
 
   EmitBackendOptionsMetadata(getCodeGenOpts());
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -81,6 +81,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
 
+  /// Provides a convenient hook to handle extra target-specific globals.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function
   /// call in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
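Note: for readers tracing the COV_None lowering above, the following is a minimal, self-contained C++ analogue of what EmitAMDGPUWorkGroupSize now emits. It is a sketch only: the function and pointer names are hypothetical, the memcpy stands in for the emitted i16 load, and only the 12/4 byte offsets and the >= COV_5 comparison are taken from the hunk above.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for the weak_odr i32 @llvm.amdgcn.abi.version global; in the real
// flow clang emits its value (e.g. 400 or 500) into the user's module.
const int32_t AbiVersion = 500;

// Analogue of the COV_None IR: select one of two byte offsets, then load i16.
uint16_t workgroupSize(unsigned Index, const uint8_t *ImplicitArgPtr,
                       const uint8_t *DispatchPtr) {
  const uint8_t *Src =
      AbiVersion >= 500
          ? ImplicitArgPtr + 12 + Index * 2 // COV5: implicit kernarg segment
          : DispatchPtr + 4 + Index * 2;    // COV4: kernel_dispatch_packet
  uint16_t Size;
  std::memcpy(&Size, Src, sizeof(Size)); // the emitted IR loads an i16 here
  return Size;
}

int main() {
  uint8_t ImplicitArgs[256] = {};
  uint8_t DispatchPacket[64] = {};
  ImplicitArgs[12] = 64; // pretend workgroup_size_x == 64 under COV5
  std::printf("x: %u\n", workgroupSize(0, ImplicitArgs, DispatchPacket));
}

Once AbiVersion is a known constant, the ternary folds away, which is the same effect IPSCCP has on the emitted icmp/select after linking.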
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -274,6 +275,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;
 
+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -354,6 +357,28 @@
   }
 }
 
+/// Emits control constants used to change per-architecture behaviour in the
+/// AMDGPU ROCm device libraries.
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  StringRef Name = "llvm.amdgcn.abi.version";
+  if (CGM.getModule().getNamedGlobal(Name))
+    return;
+
+  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
+  llvm::Constant *COV = llvm::ConstantInt::get(
+      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
+
+  // It needs to be constant weak_odr without externally_initialized so that
+  // the load instruction can be eliminated by the IPSCCP.
+  auto *GV = new llvm::GlobalVariable(
+      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV,
+      Name, nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+}
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUProtectedVisibility(D, GV)) {
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1370,7 +1370,10 @@
       // matches the current toolchain triple. If it is not present
       // at all, target and host share a toolchain.
       if (A->getOption().matches(options::OPT_m_Group)) {
-        if (SameTripleAsHost)
+        // Pass code object version to device toolchain
+        // to correctly set metadata in intermediate files.
+        if (SameTripleAsHost ||
+            A->getOption().matches(options::OPT_mcode_object_version_EQ))
           DAL->append(A);
         else
           Modified = true;
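Note: the weak_odr-without-externally_initialized requirement in emitTargetGlobals above is what entitles IPSCCP to fold the COV_None compare. A rough single-file analogue (hypothetical names, not part of the patch; a C++17 inline constexpr variable lowers to a comparable ODR linkage where every definition must be identical):

#include <cstdio>

// Plays the role of @llvm.amdgcn.abi.version: constant, ODR linkage, so the
// compiler may assume its initializer at every use.
inline constexpr int AbiVersion = 500;

constexpr int kernargOffset(int Index) {
  // Folds for the same reason the IR icmp/select folds after the device
  // library is linked with user code that defines the constant.
  return AbiVersion >= 500 ? 12 + Index * 2 : 4 + Index * 2;
}

static_assert(kernargOffset(0) == 12, "x-dimension offset under COV5");

int main() { std::printf("%d\n", kernargOffset(2)); } // prints 16

A plain weak definition (or an externally_initialized global) would not permit this: the value could legally differ at link time, which is exactly why the comment in the hunk insists on weak_odr.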
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -8645,6 +8645,14 @@
     CmdArgs.push_back("--device-debug");
   }
 
+  // code-object-version=X needs to be passed to clang-linker-wrapper to
+  // ensure that it is used by lld.
+  if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {
+    CmdArgs.push_back(Args.MakeArgString("-mllvm"));
+    CmdArgs.push_back(Args.MakeArgString(
+        Twine("--amdhsa-code-object-version=") + A->getValue()));
+  }
+
   for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString("--ptxas-arg=" + A));
diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
new file
--- /dev/null
+++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
+// RUN:   -mcode-object-version=4 -DUSER -x hip -o %t_4.bc %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
+// RUN:   -mcode-object-version=5 -DUSER -x hip -o %t_5.bc %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
+// RUN:   -mcode-object-version=none -DDEVICELIB -x hip -o %t_0.bc %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
+// RUN:   %t_4.bc -mlink-builtin-bitcode %t_0.bc -o - |\
+// RUN:   FileCheck -check-prefix=LINKED4 %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
+// RUN:   %t_5.bc -mlink-builtin-bitcode %t_0.bc -o - |\
+// RUN:   FileCheck -check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"
+
+// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
+// LINKED4-LABEL: bar
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+// LINKED4: "amdgpu_code_object_version", i32 400
+
+// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+// LINKED5-LABEL: bar
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+// LINKED5: "amdgpu_code_object_version", i32 500
+
+#ifdef DEVICELIB
+__device__ void bar(int *x, int *y, int *z)
+{
+  *x = __builtin_amdgcn_workgroup_size_x();
+  *y = __builtin_amdgcn_workgroup_size_y();
+  *z = __builtin_amdgcn_workgroup_size_z();
+}
+#endif
+
+#ifdef USER
+__device__ void bar(int *x, int *y, int *z);
+__device__ void foo()
+{
+  int *x, *y, *z;
+  bar(x, y, z);
+}
+#endif
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -7,6 +7,10 @@
 // RUN:   -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefix=COV5 %s
 
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN:   -fcuda-is-device -mcode-object-version=none -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=COVNONE %s
+
 #include "Inputs/cuda.h"
 
 // PRECOV5-LABEL: test_get_workgroup_size
@@ -26,6 +30,36 @@
 // COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 // COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
 // COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+
+// COVNONE-LABEL: test_get_workgroup_size
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// COVNONE: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
 __device__ void test_get_workgroup_size(int d, int *out)
 {
   switch (d) {
diff --git a/clang/test/CodeGenOpenCL/opencl_types.cl b/clang/test/CodeGenOpenCL/opencl_types.cl
--- a/clang/test/CodeGenOpenCL/opencl_types.cl
+++ b/clang/test/CodeGenOpenCL/opencl_types.cl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-SPIR
-// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-AMDGCN
+// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-SPIR
+// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-AMDGCN
 
 #define CLK_ADDRESS_CLAMP_TO_EDGE 2
 #define CLK_NORMALIZED_COORDS_TRUE 1
@@ -7,7 +7,6 @@
 #define CLK_FILTER_LINEAR 0x20
 
 constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_NEAREST;
-// CHECK-COM-NOT: constant i32
 
 void fnc1(image1d_t img) {}
 // CHECK-SPIR: @fnc1(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -403,6 +403,12 @@
     llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
   }
 
+  // Pass on -mllvm options to the clang invocation.
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(Arg->getValue());
+  }
+
   if (Args.hasArg(OPT_debug))
     CmdArgs.push_back("-g");
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -288,7 +288,7 @@
 add_custom_target(omptarget.devicertl.amdgpu)
 foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
+    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
     compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
   else()
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }
 
+  /// Get the Code Object Version of the ELF.
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected<hsa_executable_symbol_t>
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  uint16_t ELFABIVersion;
 };
 
 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : GenericKernelTy(Name, ExecutionMode),
-        ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+      : GenericKernelTy(Name, ExecutionMode) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.
 
+    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
     // Get additional kernel info read from image
    KernelInfo = AMDImage.getKernelInfo(getName());
     if (!KernelInfo.has_value())
@@ -476,6 +482,10 @@
   /// Get the HSA kernel object representing the kernel function.
   uint64_t getKernelObject() const { return KernelObject; }
 
+  /// Get the size of implicitargs based on the code object version.
+  /// @return 56 for cov4 and 256 for cov5
+  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+
 private:
   /// The kernel object to execute.
   uint64_t KernelObject;
@@ -486,7 +496,7 @@
   uint32_t PrivateSize;
 
   /// The size of implicit kernel arguments.
-  const uint32_t ImplicitArgsSize;
+  uint32_t ImplicitArgsSize;
 
   /// Additional Info for the AMD GPU Kernel
   std::optional<utils::KernelMetaDataTy> KernelInfo;
@@ -2627,8 +2637,8 @@
     if (Result)
       return Plugin::error("Loaded HSA executable does not validate");
 
-    if (auto Err =
-            utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
+    if (auto Err = utils::readAMDGPUMetaDataFromImage(
+            getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
       return Err;
 
     return Plugin::success();
@@ -2993,6 +3003,15 @@
   if (GenericDevice.getRPCServer())
     Stream->setRPCServer(GenericDevice.getRPCServer());
 
+  // Only COV5 implicitargs need to be set. COV4 implicitargs are not used.
+  if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
+    ImplArgs->BlockCountX = NumBlocks;
+    ImplArgs->GroupSizeX = NumThreads;
+    ImplArgs->GroupSizeY = 1;
+    ImplArgs->GroupSizeZ = 1;
+    ImplArgs->GridDims = 1;
+  }
+
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                   GroupSize, ArgsMemoryManager);
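Note: the ELFABIVersion consumed by the launch path above is read in the next file through LLVM's ELF types, but stripped of the plugin plumbing it is just one byte of e_ident. A self-contained sketch with hypothetical header bytes (the constants 8, 64, and 3 are EI_ABIVERSION, ELFOSABI_AMDGPU_HSA, and ELFABIVERSION_AMDGPU_HSA_V5 from llvm/BinaryFormat/ELF.h):

#include <cstdint>
#include <cstdio>

constexpr int EI_ABIVERSION = 8; // index into e_ident, per the ELF spec

uint16_t elfAbiVersion(const unsigned char *Image) {
  return Image[EI_ABIVERSION];
}

int main() {
  // Fabricated e_ident: magic, 64-bit, little-endian, current version,
  // OSABI 64 (AMDGPU HSA), ABI version 3 (HSA code object v5).
  const unsigned char Hdr[16] = {0x7f, 'E', 'L', 'F', 2, 1, 1, 64, 3};
  std::printf("ELF ABI version: %u\n", (unsigned)elfAbiVersion(Hdr));
}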
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/YAMLTraits.h"
 
+using namespace llvm::ELF;
 namespace llvm {
 namespace omp {
@@ -32,19 +33,29 @@
 namespace plugin {
 namespace utils {
 
-// The implicit arguments of AMDGPU kernels.
+// The implicit arguments of COV5 AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 byte offset.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 byte offset.
 };
 
-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-              "Unexpected size of implicit arguments");
+// Dummy struct for COV4 implicitargs.
+struct AMDGPUImplicitArgsTyCOV4 {
+  uint8_t Unused[56];
+};
+
+uint32_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5
+             ? sizeof(AMDGPUImplicitArgsTyCOV4)
+             : sizeof(AMDGPUImplicitArgsTy);
+}
 
 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +306,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-                                  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+                                  StringMap<KernelMetaDataTy> &KernelInfoMap,
+                                  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter
 
   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +317,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
     if (S.sh_type != ELF::SHT_NOTE)
       continue;
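Note: a quick, self-contained way to sanity-check the new 256-byte COV5 layout above. The struct is mirrored here solely so the static_asserts compile standalone; it is not part of the patch, and the 12/64 offsets follow directly from the field sizes:

#include <cstddef>
#include <cstdint>

// Mirror of the COV5 implicit-args struct from the hunk above.
struct AMDGPUImplicitArgsTy {
  uint32_t BlockCountX;
  uint32_t BlockCountY;
  uint32_t BlockCountZ;
  uint16_t GroupSizeX;
  uint16_t GroupSizeY;
  uint16_t GroupSizeZ;
  uint8_t Unused0[46];
  uint16_t GridDims;
  uint8_t Unused1[190];
};

// The COV5 block is 256 bytes (vs. 56 for COV4, matching the @return comment
// in rtl.cpp), with GroupSizeX at byte 12 and GridDims at byte 64 — the fields
// the launch path fills in for COV5 kernels.
static_assert(sizeof(AMDGPUImplicitArgsTy) == 256, "COV5 block is 256 bytes");
static_assert(offsetof(AMDGPUImplicitArgsTy, GroupSizeX) == 12,
              "group sizes start at byte 12");
static_assert(offsetof(AMDGPUImplicitArgsTy, GridDims) == 64,
              "grid-dims word sits at byte 64");

int main() { return 0; }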