diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -268,7 +268,7 @@ LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") -LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for OpenMP/HIP") LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP") LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP") LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1047,9 +1047,8 @@ BothFlags<[HelpHidden], " in overloading resolution for CUDA/HIP">>; def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">, Flags<[CC1Option]>, - HelpText<"Default max threads per block for kernel launch bounds for HIP">, - MarshallingInfoInt, "1024">, - ShouldParseIf; + HelpText<"Default max threads per block for kernel launch bounds for OpenMP/HIP">, + MarshallingInfoInt, "1024">; def fgpu_inline_threshold_EQ : Joined<["-"], "fgpu-inline-threshold=">, Flags<[HelpHidden]>, HelpText<"Inline threshold for device compilation for CUDA/HIP">; diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -9549,6 +9549,10 @@ // TODO: This should be moved to language specific attributes instead. if (IsHIPKernel || IsOpenMPkernel) F->addFnAttr("uniform-work-group-size", "true"); + if (IsOpenMPkernel) + F->addFnAttr("amdgpu-flat-work-group-size", + std::string("1,") + + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock)); if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -788,7 +788,7 @@ if (DriverArgs.hasArg(options::OPT_nogpulib)) return; - // Get the device name and canonicalize it + // Get the ndevice name and canonicalize it const StringRef GpuArch = getGPUArch(DriverArgs); auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind); diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -54,6 +54,12 @@ CC1Args.push_back(DriverArgs.MakeArgStringRef(GPUArch)); CC1Args.push_back("-fcuda-is-device"); + if (DriverArgs.hasArg(options::OPT_gpu_max_threads_per_block_EQ)) + CC1Args.push_back(DriverArgs.MakeArgString( + "--gpu-max-threads-per-block=" + + DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ, + "1024"))); + if (DriverArgs.hasArg(options::OPT_nogpulib)) return; diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -233,13 +233,11 @@ false)) CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"}); - StringRef MaxThreadsPerBlock = - DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); - if (!MaxThreadsPerBlock.empty()) { - std::string ArgStr = - (Twine("--gpu-max-threads-per-block=") + MaxThreadsPerBlock).str(); - CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr)); - } + if (DriverArgs.hasArg(options::OPT_gpu_max_threads_per_block_EQ)) + CC1Args.push_back(DriverArgs.MakeArgString( + "--gpu-max-threads-per-block=" + + DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ, + "1024"))); CC1Args.push_back("-fcuda-allow-variadic-functions"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -518,7 +518,8 @@ Diags.Report(diag::warn_ignored_hip_only_option) << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args); - if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ) && !LangOpts.HIP) + if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ) && !LangOpts.HIP && + !LangOpts.OpenMPIsDevice) Diags.Report(diag::warn_ignored_hip_only_option) << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args); diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -373,3 +373,13 @@ // RUN: | FileCheck --check-prefix=XARCH-DEVICE %s // XARCH-DEVICE: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-O3" // XARCH-DEVICE-NOT: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-O3" + +// +// Check that `--gpu-max-threads-per-block` works for AMDGPU OpenMP offloading. +// +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ +// RUN: -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1030 \ +// RUN: -nogpulib --gpu-max-threads-per-block=512 %s 2>&1 \ +// RUN: | FileCheck --check-prefix=AMD-MAX-THREADS %s +// AMD-MAX-THREADS: "-cc1" {{.*}} "--gpu-max-threads-per-block=512" +// AMD-MAX-THREADS-SAME: "-fopenmp-is-device" diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp --- a/clang/test/OpenMP/amdgcn-attributes.cpp +++ b/clang/test/OpenMP/amdgcn-attributes.cpp @@ -2,6 +2,7 @@ // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=DEFAULT,ALL %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s --gpu-max-threads-per-block=512 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=MAX-THREADS %s // RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=CPU,ALL %s // RUN: %clang_cc1 -menable-no-nans -mno-amdgpu-ieee -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=NOIEEE,ALL %s @@ -32,12 +33,14 @@ return x + 1; } -// DEFAULT: attributes #0 = { convergent noinline norecurse nounwind optnone "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } -// NOIEEE: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// UNSAFEATOMIC: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-unsafe-fp-atomics"="true" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// DEFAULT: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// MAX-THREADS: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,512" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +// NOIEEE: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// UNSAFEATOMIC: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-unsafe-fp-atomics"="true" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // DEFAULT: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// MAX-THREADS: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CPU: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } // NOIEEE: attributes #1 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // UNSAFEATOMIC: attributes #1 = { convergent mustprogress noinline nounwind optnone "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }