diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2025,7 +2025,8 @@ llvm::AttrBuilder FuncAttrs(F.getContext()); getTrivialDefaultFunctionAttributes(F.getName(), F.hasOptNone(), /*AttrOnCallSite=*/false, FuncAttrs); - GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs); + GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs, + /*AddTargetFeatures=*/false); if (!WillInternalize && F.isInterposable()) { // Do not promote "dynamic" denormal-fp-math to this translation unit's diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1583,7 +1583,8 @@ ForDefinition_t IsForDefinition = NotForDefinition); bool GetCPUAndFeaturesAttributes(GlobalDecl GD, - llvm::AttrBuilder &AttrBuilder); + llvm::AttrBuilder &AttrBuilder, + bool SetTargetFeatures = true); void setNonAliasAttributes(GlobalDecl GD, llvm::GlobalObject *GO); /// Set function attributes for a function declaration. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2226,7 +2226,8 @@ } bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD, - llvm::AttrBuilder &Attrs) { + llvm::AttrBuilder &Attrs, + bool SetTargetFeatures) { // Add target-cpu and target-features attributes to functions. If // we have a decl for the function and it has a target attribute then // parse that and add it to the feature set. @@ -2286,7 +2287,7 @@ Attrs.addAttribute("tune-cpu", TuneCPU); AddedAttr = true; } - if (!Features.empty()) { + if (!Features.empty() && SetTargetFeatures) { llvm::sort(Features); Attrs.addAttribute("target-features", llvm::join(Features, ",")); AddedAttr = true; diff --git a/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl b/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl @@ -0,0 +1,7 @@ +typedef unsigned long ulong; + +__attribute__((target("gfx11-insts"))) +ulong do_intrin_stuff(void) +{ + return __builtin_amdgcn_s_sendmsg_rtnl(0x0); +} diff --git a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu --- a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu +++ b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu @@ -132,24 +132,32 @@ // Default mode relies on the implicit check-not for the denormal-fp-math. -// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } -// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } -// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } +// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" +// PSZ-SAME: "target-cpu"="gfx803" +// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" +// PSZ-SAME: "target-cpu"="gfx803" +// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" +// PSZ-SAME: "target-cpu"="gfx803" // FIXME: Should check-not "denormal-fp-math" within the line -// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } -// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } -// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} } +// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" +// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803" +// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" +// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803" +// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" +// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803" // IEEEF32-PSZF64-DYNF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} } // implicit check-not // implicit check-not -// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} } -// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} } -// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} } - +// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" +// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803" +// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" +// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803" +// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" +// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803" // -mlink-bitcode-file doesn't internalize or propagate attributes. // NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} } diff --git a/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu b/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu @@ -0,0 +1,48 @@ +// Verify the behavior of the +gfxN-insts in the way that +// rocm-device-libs should be built with. e.g. If the device libraries has a function +// with "+gfx11-insts", that attribute should still be present after linking and not +// overwritten with the current target's settings. + +// This is important because at this time, many device-libs functions that are only +// available on some GPUs put an attribute such as "+gfx11-insts" so that +// AMDGPURemoveIncompatibleFunctions can detect & remove them if needed. + +// Build the fake device library in the way rocm-device-libs should be built. +// +// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa\ +// RUN: -mcode-object-version=none -emit-llvm-bc \ +// RUN: %S/Inputs/ocml-sample-target-attrs.cl -o %t.bc + +// Check the default behavior +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \ +// RUN: -mlink-builtin-bitcode %t.bc \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE + +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 -fcuda-is-device \ +// RUN: -mlink-builtin-bitcode %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE + +// Check the case where no internalization is performed +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \ +// RUN: -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE + +// Check the case where no internalization is performed +// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 \ +// RUN: -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE + + +// CHECK: define {{.*}} i64 @do_intrin_stuff() #[[ATTR:[0-9]+]] +// INTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-cpu"="gfx{{.*}}" "target-features"="+gfx11-insts" +// NOINTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-features"="+gfx11-insts" + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +typedef unsigned long ulong; + +extern "C" { +__device__ ulong do_intrin_stuff(void); + +__global__ void kernel_f16(ulong* out) { + *out = do_intrin_stuff(); + } +}