diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -358,9 +358,13 @@ TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); std::string KernelLaunchAPI = "LaunchKernel"; - if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream == - LangOptions::GPUDefaultStreamKind::PerThread) - KernelLaunchAPI = KernelLaunchAPI + "_spt"; + if (CGF.getLangOpts().GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) { + if (CGF.getLangOpts().HIP) + KernelLaunchAPI = KernelLaunchAPI + "_spt"; + else if (CGF.getLangOpts().CUDA) + KernelLaunchAPI = KernelLaunchAPI + "_ptsz"; + } auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get(LaunchKernelName); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -574,6 +574,9 @@ Builder.defineMacro("__CLANG_RDC__"); if (!LangOpts.HIP) Builder.defineMacro("__CUDA__"); + if (LangOpts.GPUDefaultStream == + LangOptions::GPUDefaultStreamKind::PerThread) + Builder.defineMacro("CUDA_API_PER_THREAD_DEFAULT_STREAM"); } if (LangOpts.HIP) { Builder.defineMacro("__HIP__"); diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h --- a/clang/test/CodeGenCUDA/Inputs/cuda.h +++ b/clang/test/CodeGenCUDA/Inputs/cuda.h @@ -58,6 +58,10 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); +extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream); + #endif extern "C" __device__ int printf(const char*, ...); diff --git a/clang/test/CodeGenCUDA/kernel-call.cu b/clang/test/CodeGenCUDA/kernel-call.cu --- a/clang/test/CodeGenCUDA/kernel-call.cu +++ b/clang/test/CodeGenCUDA/kernel-call.cu @@ -2,6 +2,9 @@ // RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK +// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ +// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM \ +// RUN: | FileCheck %s --check-prefixes=CUDA-PTH,CHECK // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ @@ -25,6 +28,7 @@ // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration // CUDA-NEW: call{{.*}}cudaLaunchKernel +// CUDA-PTH: call{{.*}}cudaLaunchKernel_ptsz __global__ void g1(int x) {} // CHECK-LABEL: define{{.*}}main