Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -231,7 +231,7 @@
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
-LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
 LANGOPT(SYCL              , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8119,9 +8119,13 @@
       assert(Max == 0 && "Max must be zero");
   } else if (IsOpenCLKernel || IsHIPKernel) {
     // By default, restrict the maximum size to a value specified by
-    // --gpu-max-threads-per-block=n or its default value.
+    // --gpu-max-threads-per-block=n or its default value for HIP.
+    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
+    const unsigned DefaultMaxWorkGroupSize =
+        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
+                       : M.getLangOpts().GPUMaxThreadsPerBlock;
     std::string AttrVal =
-        std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
+        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
     F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
   }
 
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
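
For illustration only (not part of the patch): a minimal HIP sketch of how the changed default surfaces in the generated IR, assuming a clang HIP device compile for amdgcn with no --gpu-max-threads-per-block override. The kernel names below are hypothetical; the expected attribute values mirror the CHECK lines above.

#include "hip/hip_runtime.h"  // assumed HIP toolchain; clang's lit tests include "Inputs/cuda.h" instead

// No explicit bounds: with this patch the HIP default becomes
// "amdgpu-flat-work-group-size"="1,1024" (previously "1,256").
__global__ void default_bounds_kernel() {}

// An explicit attribute still takes precedence over the default and
// yields "amdgpu-flat-work-group-size"="32,64", as in the test above.
__global__ void __attribute__((amdgpu_flat_work_group_size(32, 64)))
bounded_kernel() {}

Passing --gpu-max-threads-per-block=256 on the clang command line restores the previous HIP default, while OpenCL kernels keep the 256 default regardless, per the OpenCLDefaultMaxWorkGroupSize constant introduced in TargetInfo.cpp.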