Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -278,6 +278,7 @@ ENUM_LANGOPT(SYCLVersion , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") +LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -1089,6 +1089,10 @@ NegFlag, BothFlags<[], " that kernel argument names are preserved (HIP only)">>, ShouldParseIf; +defm offload_uniform_block : BoolFOption<"offload-uniform-block", + LangOpts<"OffloadUniformBlock">, Default<"LangOpts->CUDA">, + PosFlag, NegFlag, + BothFlags<[], " that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)">>; def hipspv_pass_plugin_EQ : Joined<["--"], "hipspv-pass-plugin=">, Group, MetaVarName<"">, HelpText<"path to a pass plugin for HIP to SPIR-V passes.">; Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2402,6 +2402,10 @@ llvm::toStringRef(CodeGenOpts.UniformWGSize)); } } + + if (TargetDecl->hasAttr() && + getLangOpts().OffloadUniformBlock) + FuncAttrs.addAttribute("uniform-work-group-size", "true"); } // Attach "no-builtins" attributes to: Index: clang/lib/CodeGen/Targets/AMDGPU.cpp =================================================================== --- clang/lib/CodeGen/Targets/AMDGPU.cpp +++ clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -401,13 +401,6 @@ if (FD) setFunctionDeclAttributes(FD, F, M); - const bool IsHIPKernel = - M.getLangOpts().HIP && FD && FD->hasAttr(); - - // TODO: This should be moved to language specific attributes instead. - if (IsHIPKernel) - F->addFnAttr("uniform-work-group-size", "true"); - if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -7264,6 +7264,9 @@ Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); } + Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block, + options::OPT_fno_offload_uniform_block); + if (IsCudaDevice || IsHIPDevice) { StringRef InlineThresh = Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ); Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu =================================================================== --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -10,10 +10,18 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \ // RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -foffload-uniform-block \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fno-offload-uniform-block \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=NOUB %s + #include "Inputs/cuda.h" __global__ void flat_work_group_size_default() { // CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]] +// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]] } __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics @@ -45,3 +53,5 @@ // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32" // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64" + +// NOUB-NOT: "uniform-work-group-size"="true" Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -205,3 +205,27 @@ // RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fgpu-approx-transcendentals \ // RUN: -x c++ %s 2>&1 | count 0 +/ Check -fno-offload-uniform-block is passed to clang -cc1 but +// (default) -fno-offload-uniform-block is not. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s + +// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-offload-uniform-block" +// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-offload-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib -foffload-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// UNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-foffload-uniform-block" +// UNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-foffload-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=DEFUNIBLK %s + +// DEFUNIBLK-NOT: "-f{{(no-)?}}offload-uniform-block" + +// Check no warnings for -f[no-]offload-uniform-block. + +// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \ +// RUN: -foffload-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0