diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -202,7 +202,6 @@ /// float-to-int conversion instructions. CODEGENOPT(StrictFloatCastOverflow, 1, 1) -CODEGENOPT(UniformWGSize , 1, 0) ///< -cl-uniform-work-group-size CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss. /// Method of Objective-C dispatch to use. ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -278,6 +278,7 @@ ENUM_LANGOPT(SYCLVersion , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") +LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -912,6 +912,12 @@ def b : JoinedOrSeparate<["-"], "b">, Flags<[LinkerInput]>, HelpText<"Pass -b to the linker on AIX">, MetaVarName<"">, Group; + +defm offload_uniform_block : BoolFOption<"offload-uniform-block", + LangOpts<"OffloadUniformBlock">, Default<"LangOpts->CUDA">, + PosFlag, NegFlag, + BothFlags<[], " that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)">>; + // OpenCL-only Options def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group, Flags<[CC1Option]>, HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">; @@ -947,9 +953,8 @@ def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group, Flags<[CC1Option]>, HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">, MarshallingInfoFlag>; -def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group, Flags<[CC1Option]>, - HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">, - MarshallingInfoFlag>; +def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group, Flags<[CC1Option]>, Alias, + HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">; def cl_no_stdinc : Flag<["-"], "cl-no-stdinc">, Group, HelpText<"OpenCL only. Disables all standard includes containing non-native compiler types and functions.">; def cl_ext_EQ : CommaJoined<["-"], "cl-ext=">, Group, Flags<[CC1Option]>, diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2397,10 +2397,15 @@ // to the compiler that the global work-size be a multiple of // the work-group size specified to clEnqueueNDRangeKernel // (i.e. work groups are uniform). - FuncAttrs.addAttribute("uniform-work-group-size", - llvm::toStringRef(CodeGenOpts.UniformWGSize)); + FuncAttrs.addAttribute( + "uniform-work-group-size", + llvm::toStringRef(getLangOpts().OffloadUniformBlock)); } } + + if (TargetDecl->hasAttr() && + getLangOpts().OffloadUniformBlock) + FuncAttrs.addAttribute("uniform-work-group-size", "true"); } // Attach "no-builtins" attributes to: diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -366,13 +366,6 @@ if (FD) setFunctionDeclAttributes(FD, F, M); - const bool IsHIPKernel = - M.getLangOpts().HIP && FD && FD->hasAttr(); - - // TODO: This should be moved to language specific attributes instead. - if (IsHIPKernel) - F->addFnAttr("uniform-work-group-size", "true"); - if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7280,6 +7280,9 @@ Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); } + Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block, + options::OPT_fno_offload_uniform_block); + if (IsCudaDevice || IsHIPDevice) { StringRef InlineThresh = Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ); diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu --- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -10,10 +10,18 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \ // RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -foffload-uniform-block \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fno-offload-uniform-block \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=NOUB %s + #include "Inputs/cuda.h" __global__ void flat_work_group_size_default() { // CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]] +// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]] } __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics @@ -45,3 +53,5 @@ // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32" // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64" + +// NOUB-NOT: "uniform-work-group-size"="true" diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl --- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl +++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl @@ -1,6 +1,7 @@ // RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL1.2 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM // RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-NONUNIFORM // RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -cl-uniform-work-group-size -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM +// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -foffload-uniform-block -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM kernel void ker() {}; // CHECK: define{{.*}}@ker() #0 diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -205,3 +205,27 @@ // RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fgpu-approx-transcendentals \ // RUN: -x c++ %s 2>&1 | count 0 +/ Check -fno-offload-uniform-block is passed to clang -cc1 but +// (default) -fno-offload-uniform-block is not. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s + +// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-offload-uniform-block" +// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-offload-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib -foffload-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// UNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-foffload-uniform-block" +// UNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-foffload-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=DEFUNIBLK %s + +// DEFUNIBLK-NOT: "-f{{(no-)?}}offload-uniform-block" + +// Check no warnings for -f[no-]offload-uniform-block. + +// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-offload-uniform-block \ +// RUN: -foffload-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0 diff --git a/clang/test/Driver/opencl.cl b/clang/test/Driver/opencl.cl --- a/clang/test/Driver/opencl.cl +++ b/clang/test/Driver/opencl.cl @@ -17,6 +17,8 @@ // RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s // RUN: %clang -S -### -cl-fp32-correctly-rounded-divide-sqrt %s 2>&1 | FileCheck --check-prefix=CHECK-ROUND-DIV %s // RUN: %clang -S -### -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s +// RUN: %clang -S -### -foffload-uniform-block %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s +// RUN: %clang -S -### -fno-offload-uniform-block -cl-uniform-work-group-size %s 2>&1 | FileCheck --check-prefix=CHECK-UNIFORM-WG %s // RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s // RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s // RUN: %clang -S -### -target spir-unknown-unknown %s 2>&1 | FileCheck --check-prefix=CHECK-W-SPIR-COMPAT %s @@ -44,7 +46,7 @@ // CHECK-DENORMS-ARE-ZERO-NOT: "-cl-denorms-are-zero" // CHECK-ROUND-DIV: "-cc1" {{.*}} "-cl-fp32-correctly-rounded-divide-sqrt" -// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-cl-uniform-work-group-size" +// CHECK-UNIFORM-WG: "-cc1" {{.*}} "-foffload-uniform-block" // CHECK-C99: error: invalid value 'c99' in '-cl-std=c99' // CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'