Index: cfe/trunk/include/clang/Basic/LangOptions.def =================================================================== --- cfe/trunk/include/clang/Basic/LangOptions.def +++ cfe/trunk/include/clang/Basic/LangOptions.def @@ -226,6 +226,8 @@ LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") +LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") + LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable") Index: cfe/trunk/include/clang/Driver/Options.td =================================================================== --- cfe/trunk/include/clang/Driver/Options.td +++ cfe/trunk/include/clang/Driver/Options.td @@ -599,6 +599,9 @@ HelpText<"HIP device library">; def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">, Group, Flags<[NoArgumentUnused, HelpHidden]>; +def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">, + Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">; +def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp @@ -236,7 +236,8 @@ EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), - CudaFeature::CUDA_USES_NEW_LAUNCH)) + CudaFeature::CUDA_USES_NEW_LAUNCH) || + CGF.getLangOpts().HIPUseNewLaunchAPI) emitDeviceStubBodyNew(CGF, Args); else emitDeviceStubBodyLegacy(CGF, Args); @@ -264,14 +265,18 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); - // Lookup cudaLaunchKernel function. + // Lookup cudaLaunchKernel/hipLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, // void **args, size_t sharedMem, // cudaStream_t stream); + // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + // void **args, size_t sharedMem, + // hipStream_t stream); TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); + auto LaunchKernelName = addPrefixToName("LaunchKernel"); IdentifierInfo &cudaLaunchKernelII = - CGM.getContext().Idents.get("cudaLaunchKernel"); + CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { if (FunctionDecl *FD = dyn_cast(Result)) @@ -280,7 +285,7 @@ if (cudaLaunchKernelFD == nullptr) { CGM.Error(CGF.CurFuncDecl->getLocation(), - "Can't find declaration for cudaLaunchKernel()"); + "Can't find declaration for " + LaunchKernelName); return; } // Create temporary dim3 grid_dim, block_dim. @@ -301,7 +306,7 @@ /*ShmemSize=*/ShmemSize.getType(), /*Stream=*/Stream.getType()}, /*isVarArg=*/false), - "__cudaPopCallConfiguration"); + addUnderscoredPrefixToName("PopCallConfiguration")); CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.getPointer(), BlockDim.getPointer(), @@ -329,7 +334,7 @@ const CGFunctionInfo &FI = CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); llvm::FunctionCallee cudaLaunchKernelFn = - CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); + CGM.CreateRuntimeFunction(FTy, LaunchKernelName); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); CGF.EmitBranch(EndBlock); Index: cfe/trunk/lib/Driver/ToolChains/Clang.cpp =================================================================== --- cfe/trunk/lib/Driver/ToolChains/Clang.cpp +++ cfe/trunk/lib/Driver/ToolChains/Clang.cpp @@ -4774,6 +4774,10 @@ // Forward -cl options to -cc1 RenderOpenCLOptions(Args, CmdArgs); + if (Args.hasFlag(options::OPT_fhip_new_launch_api, + options::OPT_fno_hip_new_launch_api, false)) + CmdArgs.push_back("-fhip-new-launch-api"); + if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) { CmdArgs.push_back( Args.MakeArgString(Twine("-fcf-protection=") + A->getValue())); Index: cfe/trunk/lib/Frontend/CompilerInvocation.cpp =================================================================== --- cfe/trunk/lib/Frontend/CompilerInvocation.cpp +++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp @@ -2517,6 +2517,7 @@ Opts.CUDADeviceApproxTranscendentals = 1; Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc); + Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); if (Opts.ObjC) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { Index: cfe/trunk/lib/Sema/SemaCUDA.cpp =================================================================== --- cfe/trunk/lib/Sema/SemaCUDA.cpp +++ cfe/trunk/lib/Sema/SemaCUDA.cpp @@ -820,7 +820,8 @@ std::string Sema::getCudaConfigureFuncName() const { if (getLangOpts().HIP) - return "hipConfigureCall"; + return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration" + : "hipConfigureCall"; // New CUDA kernel launch sequence. if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(), Index: cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h =================================================================== --- cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h +++ cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h @@ -14,12 +14,21 @@ __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; -typedef struct cudaStream *cudaStream_t; -typedef enum cudaError {} cudaError_t; #ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, - cudaStream_t stream = 0); + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); #else +typedef struct cudaStream *cudaStream_t; +typedef enum cudaError {} cudaError_t; extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, cudaStream_t stream = 0); Index: cfe/trunk/test/CodeGenCUDA/kernel-call.cu =================================================================== --- cfe/trunk/test/CodeGenCUDA/kernel-call.cu +++ cfe/trunk/test/CodeGenCUDA/kernel-call.cu @@ -3,14 +3,17 @@ // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP,CHECK - +// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK +// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK #include "Inputs/cuda.h" // CHECK-LABEL: define{{.*}}g1 -// HIP: call{{.*}}hipSetupArgument -// HIP: call{{.*}}hipLaunchByPtr +// HIP-OLD: call{{.*}}hipSetupArgument +// HIP-OLD: call{{.*}}hipLaunchByPtr +// HIP-NEW: call{{.*}}__hipPopCallConfiguration +// HIP-NEW: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration @@ -19,7 +22,8 @@ // CHECK-LABEL: define{{.*}}main int main(void) { - // HIP: call{{.*}}hipConfigureCall + // HIP-OLD: call{{.*}}hipConfigureCall + // HIP-NEW: call{{.*}}__hipPushCallConfiguration // CUDA-OLD: call{{.*}}cudaConfigureCall // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration // CHECK: icmp