Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -278,6 +278,7 @@ ENUM_LANGOPT(SYCLVersion , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") +LANGOPT(HIPUniformBlock, 1, 0, "Assume that HIP kernels are launched with uniform block sizes") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -1089,6 +1089,10 @@ NegFlag, BothFlags<[], " that kernel argument names are preserved (HIP only)">>, ShouldParseIf; +defm hip_uniform_block : BoolFOption<"hip-uniform-block", + LangOpts<"HIPUniformBlock">, DefaultTrue, + PosFlag, NegFlag, + BothFlags<[], " that kernels are launched with uniform block sizes">>; def hipspv_pass_plugin_EQ : Joined<["--"], "hipspv-pass-plugin=">, Group, MetaVarName<"">, HelpText<"path to a pass plugin for HIP to SPIR-V passes.">; Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2402,6 +2402,9 @@ llvm::toStringRef(CodeGenOpts.UniformWGSize)); } } + + if (TargetDecl->hasAttr() && getLangOpts().HIPUniformBlock) + FuncAttrs.addAttribute("uniform-work-group-size", "true"); } // Attach "no-builtins" attributes to: Index: clang/lib/CodeGen/Targets/AMDGPU.cpp =================================================================== --- clang/lib/CodeGen/Targets/AMDGPU.cpp +++ clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -401,13 +401,6 @@ if (FD) setFunctionDeclAttributes(FD, F, M); - const bool IsHIPKernel = - M.getLangOpts().HIP && FD && FD->hasAttr(); - - // TODO: This should be moved to language specific attributes instead. - if (IsHIPKernel) - F->addFnAttr("uniform-work-group-size", "true"); - if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -7245,6 +7245,8 @@ if (IsHIP) { CmdArgs.push_back("-fcuda-allow-variadic-functions"); Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); + Args.addOptOutFlag(CmdArgs, options::OPT_fhip_uniform_block, + options::OPT_fno_hip_uniform_block); } if (IsCudaDevice || IsHIPDevice) { Index: clang/test/CodeGenHIP/default-attributes.hip =================================================================== --- clang/test/CodeGenHIP/default-attributes.hip +++ clang/test/CodeGenHIP/default-attributes.hip @@ -5,6 +5,9 @@ // RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ // RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -fno-hip-uniform-block \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=NOUNIBLK %s + #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -20,6 +23,12 @@ // OPT-NEXT: entry: // OPT-NEXT: ret void // +// NOUNIBLK: Function Attrs: convergent mustprogress noinline nounwind optnone +// NOUNIBLK-LABEL: define {{[^@]+}}@_Z4funcv +// NOUNIBLK-SAME: () #[[ATTR0:[0-9]+]] { +// NOUNIBLK-NEXT: entry: +// NOUNIBLK-NEXT: ret void +// __device__ void func() { } @@ -36,21 +45,34 @@ // OPT-NEXT: entry: // OPT-NEXT: ret void // +// NOUNIBLK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// NOUNIBLK-LABEL: define {{[^@]+}}@_Z6kernelv +// NOUNIBLK-SAME: () #[[ATTR1:[0-9]+]] { +// NOUNIBLK-NEXT: entry: +// NOUNIBLK-NEXT: ret void +// __global__ void kernel() { } //. -// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// OPT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// NOUNIBLK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOUNIBLK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } //. -// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} +// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPT: !2 = !{i32 1, !"wchar_size", i32 4} +// NOUNIBLK: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// NOUNIBLK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// NOUNIBLK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -169,3 +169,25 @@ // RUN: %clang -### -nogpuinc -nogpulib -fhip-fp32-correctly-rounded-divide-sqrt \ // RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefixes=CRDS %s // CRDS-NOT: "-f{{(no-)?}}hip-fp32-correctly-rounded-divide-sqrt" + +// Check -fno-hip-uniform-block is passed to clang -cc1 but +// (default) -fhip-uniform-block is not. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s + +// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-hip-uniform-block" +// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-hip-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib -fhip-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// RUN: %clang -### -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// UNIBLK-NOT: "-f{{(no-)?}}hip-uniform-block" + +// Check no warnings for -f[no-]uniform-block. + +// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \ +// RUN: -fhip-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0