Index: clang/include/clang/Basic/LangOptions.h =================================================================== --- clang/include/clang/Basic/LangOptions.h +++ clang/include/clang/Basic/LangOptions.h @@ -291,6 +291,13 @@ ExtendTo64 }; + enum class DefaultStreamKind { + /// Legacy default stream + Legacy, + /// Per-thread default stream + PerThread, + }; + public: /// The used language standard. LangStandard::Kind LangStd; @@ -384,6 +391,9 @@ /// input is a header file (i.e. -x c-header). bool IsHeaderFile = false; + /// The default stream kind used for HIP kernel launching. + DefaultStreamKind DefaultStream; + LangOptions(); // Define accessors/mutators for language options of enumeration type. Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -959,6 +959,14 @@ TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +def default_stream_EQ : Joined<["--"], "default-stream=">, + HelpText<"Specify default stream. Valid values are 'legacy' and 'per-thread'. The default value is 'legacy'. (HIP only)">, + Flags<[CC1Option]>, + Values<"legacy,per-thread">, + NormalizedValuesScope<"LangOptions::DefaultStreamKind">, + NormalizedValues<["Legacy", "PerThread"]>, + MarshallingInfoEnum, "Legacy">; + def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -332,15 +332,22 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel/hipLaunchKernel function. + // HIP kernel launching API name depends on --default-stream option. For + // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', + // it is hipLaunchKernel_spt. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); - // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, - // void **args, size_t sharedMem, - // hipStream_t stream); + // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, + // dim3 blockDim, void **args, + // size_t sharedMem, hipStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); - auto LaunchKernelName = addPrefixToName("LaunchKernel"); + std::string KernelLaunchAPI = "LaunchKernel"; + if (CGF.getLangOpts().HIP && CGF.getLangOpts().DefaultStream == + LangOptions::DefaultStreamKind::PerThread) + KernelLaunchAPI = KernelLaunchAPI + "_spt"; + auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -6896,8 +6896,16 @@ CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID))); } - if (IsHIP) + if (IsHIP) { CmdArgs.push_back("-fcuda-allow-variadic-functions"); + auto DefStream = Args.getLastArgValue(options::OPT_default_stream_EQ); + if (!DefStream.empty()) { + Args.AddLastArg(CmdArgs, options::OPT_default_stream_EQ); + if (DefStream == "per-thread") + CmdArgs.push_back( + Args.MakeArgString("-DHIP_API_PER_THREAD_DEFAULT_STREAM")); + } + } if (IsCudaDevice || IsHIPDevice) { StringRef InlineThresh = Index: clang/test/CodeGenCUDA/Inputs/cuda.h =================================================================== --- clang/test/CodeGenCUDA/Inputs/cuda.h +++ clang/test/CodeGenCUDA/Inputs/cuda.h @@ -35,11 +35,18 @@ extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, hipStream_t stream = 0); +#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, hipStream_t stream); #else +extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#endif //HIP_API_PER_THREAD_DEFAULT_STREAM +#else typedef struct cudaStream *cudaStream_t; typedef enum cudaError {} cudaError_t; extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, Index: clang/test/CodeGenCUDA/kernel-call.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-call.cu +++ clang/test/CodeGenCUDA/kernel-call.cu @@ -5,7 +5,13 @@ // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: --default-stream=legacy \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: --default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,PTH,CHECK #include "Inputs/cuda.h" @@ -13,7 +19,8 @@ // HIP-OLD: call{{.*}}hipSetupArgument // HIP-OLD: call{{.*}}hipLaunchByPtr // HIP-NEW: call{{.*}}__hipPopCallConfiguration -// HIP-NEW: call{{.*}}hipLaunchKernel +// LEGACY: call{{.*}}hipLaunchKernel +// PTH: call{{.*}}hipLaunchKernel_spt // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -14,6 +14,12 @@ // DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init" // DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init" +// Check --default-stream=per-thread. +// RUN: %clang -### -nogpuinc -nogpulib --default-stream=per-thread \ +// RUN: %s 2>&1 | FileCheck -check-prefix=PTH %s +// PTH: clang{{.*}}" "-cc1" {{.*}}"--default-stream=per-thread"{{.*}}"-DHIP_API_PER_THREAD_DEFAULT_STREAM" +// PTH: clang{{.*}}" "-cc1" {{.*}}"--default-stream=per-thread"{{.*}}"-DHIP_API_PER_THREAD_DEFAULT_STREAM" + // RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \ // RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \ // RUN: FileCheck -check-prefix=MLLVM %s