diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -467,6 +467,8 @@
 /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
 CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1)
 
+CODEGENOPT(GPUFastRelaxedMath, 1, 0)
+
 // Whether to emit Swift Async function extended frame information: auto,
 // never, always.
 ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -964,6 +964,10 @@
   Alias<fgpu_flush_denormals_to_zero>;
 def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">,
   Alias<fno_gpu_flush_denormals_to_zero>;
+defm gpu_fast_relaxed_math : BoolFOption<"gpu-fast-relaxed-math",
+  CodeGenOpts<"GPUFastRelaxedMath">, DefaultFalse,
+  PosFlag<SetTrue, [CC1Option]>,
+  NegFlag<SetFalse>>;
 defm gpu_rdc : BoolFOption<"gpu-rdc",
   LangOpts<"GPURelocatableDeviceCode">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Generate relocatable device code, also known as separate compilation mode">,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -919,6 +919,7 @@
   if (getCodeGenOpts().SkipRaxSetup)
     getModule().addModuleFlag(llvm::Module::Override, "SkipRaxSetup", 1);
 
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
 
   EmitBackendOptionsMetadata(getCodeGenOpts());
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -63,6 +63,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
 
+  /// Provides a convenient hook to handle extra target-specific globals.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function call
   /// in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/IntrinsicsS390.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetParser.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm> // std::sort
 
@@ -9307,6 +9308,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;
 
+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -9422,6 +9425,66 @@
   }
 }
 
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+
+  StringRef CPU = CGM.getTarget().getTargetOpts().CPU;
+  llvm::AMDGPU::GPUKind Kind = llvm::AMDGPU::parseArchAMDGCN(CPU);
+  if (Kind == llvm::AMDGPU::GK_NONE)
+    return;
+  unsigned Features = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
+
+  // Parse the ISA version from the processor name, e.g. gfx90a -> 9.0.a.
+  unsigned Major;
+  unsigned Minor;
+  unsigned Stepping;
+  StringRef Identifier =
+      CPU.drop_while([](char C) { return !llvm::isDigit(C); });
+  if (Identifier.take_back(1).getAsInteger(16, Stepping) ||
+      Identifier.drop_back(1).take_back(1).getAsInteger(10, Minor) ||
+      Identifier.drop_back(2).getAsInteger(10, Major))
+    return;
+
+  auto AddGlobal = [&](StringRef Name, unsigned Value, unsigned Size = 8) {
+    if (CGM.getModule().getNamedGlobal(Name))
+      return;
+
+    auto *Type =
+        llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), Type, /*isConstant=*/true,
+        llvm::GlobalValue::LinkageTypes::WeakODRLinkage,
+        llvm::ConstantInt::get(Type, Value), Name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        /*AddressSpace=*/4);
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+    GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };
+
+  // TODO: Add flags to toggle these as-needed.
+  bool DenormAtZero = !((Features & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
+                        (Features & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32));
+  bool Wavefront64 = !(Features & llvm::AMDGPU::FEATURE_WAVE32);
+  bool FastRelaxedMath = CGM.getCodeGenOpts().GPUFastRelaxedMath;
+  bool FiniteOnly = false;
+  bool UnsafeMath = false;
+  bool CorrectSqrt = true;
+
+  // Control constants for math operations.
+  AddGlobal("__oclc_daz_opt", DenormAtZero);
+  AddGlobal("__oclc_wavefrontsize64", Wavefront64);
+  AddGlobal("__oclc_finite_only_opt", FiniteOnly || FastRelaxedMath);
+  AddGlobal("__oclc_unsafe_math_opt", UnsafeMath || FastRelaxedMath);
+  AddGlobal("__oclc_correctly_rounded_sqrt32", CorrectSqrt);
+
+  // Control constants for the system.
+ AddGlobal("__oclc_ISA_version", Minor + Major * 1000, 32); + AddGlobal("__oclc_ABI_version", 400, 32); +} + void AMDGPUTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { if (requiresAMDGPUProtectedVisibility(D, GV)) { diff --git a/clang/test/CodeGen/amdgcn-occl-constants.c b/clang/test/CodeGen/amdgcn-occl-constants.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/amdgcn-occl-constants.c @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -fgpu-fast-relaxed-math \ +// RUN: -S -emit-llvm -o - %s | FileCheck %s --check-prefix=FAST + +void foo() {} + +// CHECK: @__oclc_daz_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1 +// CHECK: @__oclc_wavefrontsize64 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// CHECK: @__oclc_finite_only_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1 +// CHECK: @__oclc_unsafe_math_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1 +// CHECK: @__oclc_correctly_rounded_sqrt32 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// CHECK: @__oclc_ISA_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 9010, align 4 +// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400, align 4 + +// FAST: @__oclc_daz_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1 +// FAST: @__oclc_wavefrontsize64 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// FAST: @__oclc_finite_only_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// FAST: @__oclc_unsafe_math_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// FAST: @__oclc_correctly_rounded_sqrt32 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1 +// FAST: @__oclc_ISA_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 9010, align 4 +// FAST: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400, align 4