diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -290,6 +290,22 @@ if (Err) return true; } + + // If we linked in AMDGCN device libraries in RDC-mode we need these + // constants to be internal to each TU. This is required as these + // variables control math settings which can change per-TU and conflict + // after linking. + // TODO: This should be handled in the backend instead. + if (!LinkModules.empty() && Gen->CGM().getTriple().isAMDGCN()) { + const StringRef GVS[] = {"__oclc_daz_opt", "__oclc_unsafe_math_opt", + "__oclc_finite_only_opt", + "__oclc_correctly_rounded_sqrt32"}; + for (StringRef Name : GVS) { + if (llvm::GlobalVariable *GV = getModule()->getGlobalVariable(Name)) + GV->setLinkage(llvm::GlobalValue::InternalLinkage); + } + } + return false; // success } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -931,6 +931,7 @@ if (getCodeGenOpts().SkipRaxSetup) getModule().addModuleFlag(llvm::Module::Override, "SkipRaxSetup", 1); + getTargetCodeGenInfo().emitTargetGlobals(*this); getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames); EmitBackendOptionsMetadata(getCodeGenOpts()); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -76,6 +76,9 @@ CodeGen::CodeGenModule &CGM, const llvm::MapVector &MangledDeclNames) const {} + /// Provides a convenient hook to handle extra target-specific globals. + virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {} + /// Any further codegen related checks that need to be done on a function call /// in a target specific manner. virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc, diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/IntrinsicsS390.h" #include "llvm/IR/Type.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" #include @@ -9287,6 +9288,8 @@ void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F, CodeGenModule &CGM) const; + void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override; + void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; unsigned getOpenCLKernelCallingConv() const override; @@ -9402,6 +9405,79 @@ } } +/// Emits control constants used to change per-architecture behaviour in the +/// AMDGPU ROCm device libraries. +void AMDGPUTargetCodeGenInfo::emitTargetGlobals( + CodeGen::CodeGenModule &CGM) const { + if (!CGM.getTriple().isAMDGCN()) + return; + StringRef CPU = CGM.getTarget().getTargetOpts().CPU; + llvm::AMDGPU::GPUKind Kind = llvm::AMDGPU::parseArchAMDGCN(CPU); + unsigned Features = llvm::AMDGPU::getArchAttrAMDGCN(Kind); + if (Kind == llvm::AMDGPU::GK_NONE) + return; + + unsigned Minor; + unsigned Major; + StringRef Identifier = CPU.drop_while([](char C) { return !isDigit(C); }); + if (Identifier.take_back(2).getAsInteger(16, Minor) || + Identifier.drop_back(2).getAsInteger(10, Major)) + return; + + auto AddGlobal = [&](StringRef Name, unsigned Value, unsigned Size, + llvm::GlobalValue::LinkageTypes Linkage = + llvm::GlobalValue::LinkOnceAnyLinkage) { + if (CGM.getModule().getNamedGlobal(Name)) + return; + + auto *Type = + llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), Type, true, Linkage, + llvm::ConstantInt::get(Type, Value), Name, nullptr, + llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant)); + GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); + GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility); + GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type)); + }; + + // The wavefront size is 64 if defined by the target or explicitly specified + // by the user. + bool Wavefront64 = + !(Features & llvm::AMDGPU::FEATURE_WAVE32) || + llvm::is_contained(CGM.getTarget().getTargetOpts().FeaturesAsWritten, + "+wavefrontsize64"); + + // Different math flags set by the current floating point contract. + bool RelaxedMath = CGM.getLangOpts().FastMath; + bool UnsafeMath = CGM.getLangOpts().UnsafeFPMath; + bool DenormAreZero = CGM.getCodeGenOpts().FP32DenormalMode == + llvm::DenormalMode::getPreserveSign(); + bool FiniteOnly = + CGM.getLangOpts().NoHonorInfs && CGM.getLangOpts().NoHonorNaNs; + + // Set correct square root rounding depending on the target lanauge. + bool CorrectSqrt = CGM.getLangOpts().OpenCL + ? CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt + : CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt; + + // Control constants for math operations. + AddGlobal("__oclc_daz_opt", DenormAreZero, /*Size=*/8); + AddGlobal("__oclc_finite_only_opt", FiniteOnly || RelaxedMath, /*Size=*/8); + AddGlobal("__oclc_unsafe_math_opt", UnsafeMath || RelaxedMath, /*Size=*/8); + AddGlobal("__oclc_correctly_rounded_sqrt32", CorrectSqrt, /*Size=*/8); + + // Control constants for the system. + AddGlobal("__oclc_wavefrontsize64", Wavefront64, /*Size=*/8, + llvm::GlobalValue::LinkOnceODRLinkage); + AddGlobal("__oclc_ISA_version", Minor + Major * 1000, /*Size=*/32, + llvm::GlobalValue::LinkOnceODRLinkage); + AddGlobal("__oclc_ABI_version", + CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32, + llvm::GlobalValue::LinkOnceODRLinkage); +} + void AMDGPUTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { if (requiresAMDGPUProtectedVisibility(D, GV)) { diff --git a/clang/test/CodeGen/amdgcn-control-constants.c b/clang/test/CodeGen/amdgcn-control-constants.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/amdgcn-control-constants.c @@ -0,0 +1,49 @@ +// Check that we generate all the expected default features for the target. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck %s --check-prefix=GFX90A +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=GFX1030 + +// GFX90A: @__oclc_daz_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX90A: @__oclc_finite_only_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX90A: @__oclc_unsafe_math_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX90A: @__oclc_correctly_rounded_sqrt32 = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 +// GFX90A: @__oclc_wavefrontsize64 = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i8 1 +// GFX90A: @__oclc_ISA_version = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i32 9010 +// GFX90A: @__oclc_ABI_version = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i32 400 + +// GFX1030: @__oclc_daz_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX1030: @__oclc_finite_only_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX1030: @__oclc_unsafe_math_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX1030: @__oclc_correctly_rounded_sqrt32 = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 +// GFX1030: @__oclc_wavefrontsize64 = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i8 0 +// GFX1030: @__oclc_ISA_version = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i32 10048 +// GFX1030: @__oclc_ABI_version = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i32 400 + +// Check that we can override the wavefront features. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -target-feature +wavefrontsize64 \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=WAVEFRONT +// WAVEFRONT: @__oclc_wavefrontsize64 = linkonce_odr hidden local_unnamed_addr addrspace(4) constant i8 1 + +// Check that we can enable denormalization at zero. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx90a -fdenormal-fp-math-f32=preserve-sign,preserve-sign \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=DENORM-AT-ZERO +// DENORM-AT-ZERO: @__oclc_daz_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 + +// Check that we can enable finite math. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx90a -ffinite-math-only \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=FINITE-MATH +// FINITE-MATH: @__oclc_finite_only_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 +// FINITE-MATH: @__oclc_unsafe_math_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 + +// Check that we can enable unsafe math. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx90a -menable-unsafe-fp-math \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=UNSAFE-MATH +// UNSAFE-MATH: @__oclc_finite_only_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// UNSAFE-MATH: @__oclc_unsafe_math_opt = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 + +// Check that we can disable/enable correctly rounded square roots. +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx90a -fno-hip-fp32-correctly-rounded-divide-sqrt \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CORRECT-SQRT +// CORRECT-SQRT: @__oclc_correctly_rounded_sqrt32 = linkonce hidden local_unnamed_addr addrspace(4) constant i8 0 +// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa -target-cpu gfx90a -cl-fp32-correctly-rounded-divide-sqrt \ +// RUN: -disable-llvm-optzns -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CL-CORRECT-SQRT +// CL-CORRECT-SQRT: @__oclc_correctly_rounded_sqrt32 = linkonce hidden local_unnamed_addr addrspace(4) constant i8 1 diff --git a/clang/test/CodeGenCUDA/amdgcn-control-constants.hip b/clang/test/CodeGenCUDA/amdgcn-control-constants.hip new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/amdgcn-control-constants.hip @@ -0,0 +1,46 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --global-value-regex "__oclc_daz_opt" +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device -target-cpu gfx90a -emit-llvm-bc -o %t.bc -DLIBRARY %s +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device -target-cpu gfx90a -mlink-builtin-bitcode %t.bc -S -emit-llvm -o - %s | FileCheck %s + +// REQUIRES: amdgpu-registered-target + +#include "Inputs/cuda.h" + +#ifdef LIBRARY + +extern unsigned char __constant__ __oclc_daz_opt; + +__device__ int foo(void) { + return __oclc_daz_opt ? 1 : 0; +} + +#else + +extern __device__ int foo(void); + +__device__ void bar(void) { + foo(); +} + +#endif +//. +// CHECK: @__oclc_daz_opt = internal local_unnamed_addr addrspace(4) constant i8 0, align 1 +//. +// CHECK-LABEL: define {{[^@]+}}@_Z3barv +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3foov() #[[ATTR1:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z3foov +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr addrspacecast (ptr addrspace(4) @__oclc_daz_opt to ptr), align 1 +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i8 [[TMP0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TOBOOL]] to i64 +// CHECK-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 1, i32 0 +// CHECK-NEXT: ret i32 [[COND]] +//