Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -227,6 +227,7 @@ LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -606,6 +606,9 @@ def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">, Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">; def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">; +def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">, + Flags<[CC1Option]>, + HelpText<"Default max threads per block for kernel launch bounds for HIP">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8068,8 +8068,11 @@ } else assert(Max == 0 && "Max must be zero"); } else if (IsOpenCLKernel || IsHIPKernel) { - // By default, restrict the maximum size to 256. - F->addFnAttr("amdgpu-flat-work-group-size", "1,256"); + // By default, restrict the maximum size to a value specified by + // --gpu-max-threads-per-block=n or its default value. + std::string AttrVal = + std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock); + F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); } if (const auto *Attr = FD->getAttr()) { Index: clang/lib/Driver/ToolChains/HIP.cpp =================================================================== --- clang/lib/Driver/ToolChains/HIP.cpp +++ clang/lib/Driver/ToolChains/HIP.cpp @@ -307,6 +307,14 @@ false)) CC1Args.push_back("-fgpu-rdc"); + StringRef MaxThreadsPerBlock = + DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); + if (!MaxThreadsPerBlock.empty()) { + std::string ArgStr = + std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str(); + CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr)); + } + if (DriverArgs.hasFlag(options::OPT_fgpu_allow_device_init, options::OPT_fno_gpu_allow_device_init, false)) CC1Args.push_back("-fgpu-allow-device-init"); Index: clang/lib/Frontend/CompilerInvocation.cpp =================================================================== --- clang/lib/Frontend/CompilerInvocation.cpp +++ clang/lib/Frontend/CompilerInvocation.cpp @@ -2559,6 +2559,12 @@ << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args); } Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); + if (Opts.HIP) + Opts.GPUMaxThreadsPerBlock = getLastArgIntValue( + Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock); + else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ)) + Diags.Report(diag::warn_ignored_hip_only_option) + << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args); if (Opts.ObjC) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu =================================================================== --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -1,13 +1,21 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s // RUN: %clang_cc1 -triple nvptx \ // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \ // RUN: -check-prefix=NAMD // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \ -// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s +// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s #include "Inputs/cuda.h" +__global__ void flat_work_group_size_default() { +// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]] +} + __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics __global__ void flat_work_group_size_32_64() { // CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]] @@ -31,7 +39,9 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64" -// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2" -// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32" -// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" +// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" +// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" +// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32" +// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64" Index: clang/test/Driver/hip-options.hip =================================================================== --- /dev/null +++ clang/test/Driver/hip-options.hip @@ -0,0 +1,10 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s + +// Check that there are commands for both host- and device-side compilations. +// +// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device" +// CHECK-SAME: "--gpu-max-threads-per-block=1024"