Index: clang/include/clang/Basic/LangOptions.h =================================================================== --- clang/include/clang/Basic/LangOptions.h +++ clang/include/clang/Basic/LangOptions.h @@ -291,6 +291,13 @@ ExtendTo64 }; + enum class GPUDefaultStreamKind { + /// Legacy default stream + Legacy, + /// Per-thread default stream + PerThread, + }; + public: /// The used language standard. LangStandard::Kind LangStd; @@ -384,6 +391,9 @@ /// input is a header file (i.e. -x c-header). bool IsHeaderFile = false; + /// The default stream kind used for HIP kernel launching. + GPUDefaultStreamKind GPUDefaultStream; + LangOptions(); // Define accessors/mutators for language options of enumeration type. Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -959,6 +959,13 @@ TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">, + HelpText<"Specify default stream. Valid values are 'legacy' and 'per-thread'. The default value is 'legacy'. (HIP only)">, + Flags<[CC1Option]>, + Values<"legacy,per-thread">, + NormalizedValuesScope<"LangOptions::GPUDefaultStreamKind">, + NormalizedValues<["Legacy", "PerThread"]>, + MarshallingInfoEnum, "Legacy">; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -332,15 +332,22 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel/hipLaunchKernel function. + // HIP kernel launching API name depends on -fgpu-default-stream option. For + // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', + // it is hipLaunchKernel_spt. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); - // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, - // void **args, size_t sharedMem, - // hipStream_t stream); + // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, + // dim3 blockDim, void **args, + // size_t sharedMem, hipStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); - auto LaunchKernelName = addPrefixToName("LaunchKernel"); + std::string KernelLaunchAPI = "LaunchKernel"; + if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) + KernelLaunchAPI = KernelLaunchAPI + "_spt"; + auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -6896,8 +6896,10 @@ CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID))); } - if (IsHIP) + if (IsHIP) { CmdArgs.push_back("-fcuda-allow-variadic-functions"); + Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); + } if (IsCudaDevice || IsHIPDevice) { StringRef InlineThresh = Index: clang/lib/Frontend/InitPreprocessor.cpp =================================================================== --- clang/lib/Frontend/InitPreprocessor.cpp +++ clang/lib/Frontend/InitPreprocessor.cpp @@ -538,6 +538,9 @@ Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5"); if (LangOpts.CUDAIsDevice) Builder.defineMacro("__HIP_DEVICE_COMPILE__"); + if (LangOpts.GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) + Builder.defineMacro("HIP_API_PER_THREAD_DEFAULT_STREAM"); } } Index: clang/test/CodeGenCUDA/Inputs/cuda.h =================================================================== --- clang/test/CodeGenCUDA/Inputs/cuda.h +++ clang/test/CodeGenCUDA/Inputs/cuda.h @@ -35,11 +35,18 @@ extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, hipStream_t stream = 0); +#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, hipStream_t stream); #else +extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#endif //HIP_API_PER_THREAD_DEFAULT_STREAM +#else typedef struct cudaStream *cudaStream_t; typedef enum cudaError {} cudaError_t; extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, Index: clang/test/CodeGenCUDA/kernel-call.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-call.cu +++ clang/test/CodeGenCUDA/kernel-call.cu @@ -5,7 +5,13 @@ // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: -fgpu-default-stream=legacy \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,PTH,CHECK #include "Inputs/cuda.h" @@ -13,7 +19,8 @@ // HIP-OLD: call{{.*}}hipSetupArgument // HIP-OLD: call{{.*}}hipLaunchByPtr // HIP-NEW: call{{.*}}__hipPopCallConfiguration -// HIP-NEW: call{{.*}}hipLaunchKernel +// LEGACY: call{{.*}}hipLaunchKernel +// PTH: call{{.*}}hipLaunchKernel_spt // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -14,6 +14,14 @@ // DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init" // DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init" +// Check -fgpu-default-stream=per-thread. +// RUN: %clang -### -nogpuinc -nogpulib -fgpu-default-stream=per-thread \ +// RUN: %s -save-temps 2>&1 | FileCheck -check-prefix=PTH %s +// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread" +// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output" +// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread" +// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output" + // RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \ // RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \ // RUN: FileCheck -check-prefix=MLLVM %s Index: clang/test/Preprocessor/predefined-macros.c =================================================================== --- clang/test/Preprocessor/predefined-macros.c +++ clang/test/Preprocessor/predefined-macros.c @@ -247,6 +247,7 @@ // CHECK-HIP-NEG-NOT: #define __CUDA_ARCH__ // CHECK-HIP-NEG-NOT: #define __HIP_DEVICE_COMPILE__ 1 // CHECK-HIP-NEG-NOT: #define __CLANG_RDC__ 1 +// CHECK-HIP-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \ // RUN: -fcuda-is-device \ @@ -265,6 +266,7 @@ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-DEV-NEG // CHECK-HIP-DEV-NEG-NOT: #define __CUDA_ARCH__ // CHECK-HIP-DEV-NEG-NOT: #define __CLANG_RDC__ 1 +// CHECK-HIP-DEV-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM // RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple x86_64-unknown-linux-gnu \ // RUN: -fgpu-rdc | FileCheck %s --check-prefix=CHECK-RDC @@ -277,3 +279,11 @@ // RUN: -fgpu-rdc -fcuda-is-device \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RDC // CHECK-RDC: #define __CLANG_RDC__ 1 + +// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \ +// RUN: -fgpu-default-stream=per-thread \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH +// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -fgpu-default-stream=per-thread \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH +// CHECK-PTH: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1